sync_inode - write an inode and its pages to disk.
* @inode: the inode to sync
Writeback as a whole is responsible for synchronizing all dirty inodes belonging to a given superblock; the per-bdi flusher threads do this work in the background:
task = kthread_create(bdi_writeback_thread, &bdi->wb,
"flush-%s", dev_name(bdi->dev));
root@szx3:/home/szx# ps -ef|grep flush
root 950 2 0 08:55 ? 00:00:00 [flush-8:0]
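The "8:0" in the thread name is the major:minor number of the backing device, so [flush-8:0] above is the flusher thread serving the bdi of /dev/sda (block major 8, minor 0) on this machine.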
bdi_writeback_thread
struct bdi_writeback {
struct backing_dev_info *bdi; /* our parent bdi */
unsigned int nr;
unsigned long last_old_flush; /* last old data flush */
unsigned long last_active; /* last time bdi thread was active */
struct task_struct *task; /* writeback thread */
struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */
spinlock_t list_lock; /* protects the b_* lists */
};
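The structure above is the per-bdi state that the flusher thread works on. As a rough, abridged sketch (freezer handling, tracepoints and priority tweaks trimmed), bdi_writeback_thread() in kernels of this era loops doing wb_do_writeback() and then sleeps, either for dirty_writeback_interval or indefinitely until new work arrives or data is dirtied:
/* Abridged sketch of the per-bdi flusher thread loop; details trimmed. */
int bdi_writeback_thread(void *data)
{
        struct bdi_writeback *wb = data;
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;

        set_freezable();
        wb->last_active = jiffies;

        while (!kthread_should_stop()) {
                /* We are awake, so the delayed wakeup timer is no longer needed. */
                del_timer(&wb->wakeup_timer);

                pages_written = wb_do_writeback(wb, 0);
                if (pages_written)
                        wb->last_active = jiffies;

                set_current_state(TASK_INTERRUPTIBLE);
                if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
                        continue;
                }

                if (wb_has_dirty_io(wb) && dirty_writeback_interval)
                        /* dirty_writeback_interval is in centisecs */
                        schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
                else
                        /* Nothing to do: sleep until work is queued or data is dirtied. */
                        schedule();
        }

        /* Flush any work that raced with us exiting */
        if (!list_empty(&bdi->work_list))
                wb_do_writeback(wb, 1);

        return 0;
}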
struct backing_dev_info {
struct list_head bdi_list;
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* Always use atomic bitops on this */
unsigned int capabilities; /* Device capabilities */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
char *name;
struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
unsigned long bw_time_stamp; /* last time write bw is updated */
unsigned long dirtied_stamp;
unsigned long written_stamp; /* pages written at bw_time_stamp */
unsigned long write_bandwidth; /* the estimated write bandwidth */
unsigned long avg_write_bandwidth; /* further smoothed write bw */
/*
* The base dirty throttle rate, re-calculated on every 200ms.
* All the bdi tasks' dirty rate will be curbed under it.
* @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
* in small steps and is much more smooth/stable than the latter.
*/
unsigned long dirty_ratelimit;
unsigned long balanced_dirty_ratelimit;
struct prop_local_percpu completions;
int dirty_exceeded;
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
struct bdi_writeback wb; /* default writeback info for this bdi */
spinlock_t wb_lock; /* protects work_list */
struct list_head work_list;
struct device *dev;
struct timer_list laptop_mode_wb_timer;
#ifdef CONFIG_DEBUG_FS
struct dentry *debug_dir;
struct dentry *debug_stats;
#endif
};
/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
struct wb_writeback_work {
long nr_pages;
struct super_block *sb;
unsigned long *older_than_this;
enum writeback_sync_modes sync_mode;
unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
struct completion *done; /* set if the caller waits */
};
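Asynchronous callers allocate such a work item and queue it on the bdi; the sketch below is modeled on __bdi_start_writeback() of this kernel era (tracepoints trimmed) and shows how a WB_SYNC_NONE request reaches the flusher thread:
static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
                      bool range_cyclic, enum wb_reason reason)
{
        struct wb_writeback_work *work;

        /*
         * This is WB_SYNC_NONE writeback, so if allocation fails just
         * wake up the flusher thread for old dirty data writeback.
         */
        work = kzalloc(sizeof(*work), GFP_ATOMIC);
        if (!work) {
                if (bdi->wb.task)
                        wake_up_process(bdi->wb.task);
                return;
        }

        work->sync_mode    = WB_SYNC_NONE;
        work->nr_pages     = nr_pages;
        work->range_cyclic = range_cyclic;
        work->reason       = reason;

        bdi_queue_work(bdi, work);      /* add to bdi->work_list and wake the thread */
}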
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
long wrote = 0;
set_bit(BDI_writeback_running, &wb->bdi->state);
while ((work = get_next_work_item(bdi)) != NULL) {
/*
* Override sync mode, in case we must wait for completion
* because this thread is exiting now.
*/
if (force_wait)
work->sync_mode = WB_SYNC_ALL;
trace_writeback_exec(bdi, work);
wrote += wb_writeback(wb, work);
/*
* Notify the caller of completion if this is a synchronous
* work item, otherwise just free it.
*/
if (work->done)
complete(work->done);
else
kfree(work);
}
/*
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
wrote += wb_check_background_flush(wb);
clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
}
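get_next_work_item() used above just pops the next queued work item off bdi->work_list under wb_lock; roughly:
static struct wb_writeback_work *
get_next_work_item(struct backing_dev_info *bdi)
{
        struct wb_writeback_work *work = NULL;

        spin_lock_bh(&bdi->wb_lock);
        if (!list_empty(&bdi->work_list)) {
                work = list_entry(bdi->work_list.next,
                                  struct wb_writeback_work, list);
                list_del_init(&work->list);
        }
        spin_unlock_bh(&bdi->wb_lock);
        return work;
}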
The first time one of an inode's pages is dirtied, we mark the dirtying-time in the inode's address_space. So this periodic writeback code just walks the superblock inode list, writing back any inodes which are older than a specific point in time.
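This kupdate-style pass is what wb_check_old_data_flush(), called at the end of wb_do_writeback(), implements. Roughly, in this kernel era, it fires at most once per dirty_writeback_interval and queues a for_kupdate work item covering the currently dirty pages:
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
        unsigned long expired;
        long nr_pages;

        /* Setting dirty_writeback_centisecs to zero disables periodic writeback. */
        if (!dirty_writeback_interval)
                return 0;

        expired = wb->last_old_flush +
                        msecs_to_jiffies(dirty_writeback_interval * 10);
        if (time_before(jiffies, expired))
                return 0;

        wb->last_old_flush = jiffies;
        nr_pages = get_nr_dirty_pages();

        if (nr_pages) {
                struct wb_writeback_work work = {
                        .nr_pages       = nr_pages,
                        .sync_mode      = WB_SYNC_NONE,
                        .for_kupdate    = 1,
                        .range_cyclic   = 1,
                        .reason         = WB_REASON_PERIODIC,
                };

                return wb_writeback(wb, &work);
        }

        return 0;
}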
__wait_on_bit
wait_queue_head_t *bit_waitqueue(void *word, int bit)
{
const int shift = BITS_PER_LONG == 32 ? 5 : 6;
const struct zone *zone = page_zone(virt_to_page(word));
unsigned long val = (unsigned long)word << shift | bit;
return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
}
zone->wait_table: the array holding the hashed wait queues
zone->wait_table_bits: power-of-2 order of the size of the wait queue hash table array
The purpose of all this is to keep track of the tasks waiting for a page to become available and to make them runnable again when possible. The trouble is that per-page wait queues would consume a lot of space, especially since so few things are waiting on pages at any given time. So instead of per-page wait queues, the kernel uses a hashed wait-queue table.
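The wake-up side uses the same hash: wake_up_bit() recomputes the bucket with bit_waitqueue() and wakes only the waiters whose (word, bit) key matches. Roughly, from kernel/wait.c of this era:
void wake_up_bit(void *word, int bit)
{
        __wake_up_bit(bit_waitqueue(word, bit), word, bit);
}

void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
{
        struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);

        if (waitqueue_active(wq))
                __wake_up(wq, TASK_NORMAL, 1, &key);
}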
BSF - Bit Scan Forward (386+)
Usage: BSF dest,src
Modifies flags: ZF
Scans the source operand for the first set bit. If a set bit is found, ZF is cleared and the destination is loaded with the index of that bit; if the source is zero, ZF is set and the destination is undefined. BSF scans forward across the bit pattern (0-n) while BSR scans in reverse (n-0).
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
struct pglist_data *pgdat = zone->zone_pgdat;
size_t alloc_size;
/*
* The per-page waitqueue mechanism uses hashed waitqueues
* per zone.
*/
zone->wait_table_hash_nr_entries =
wait_table_hash_nr_entries(zone_size_pages);
zone->wait_table_bits =
wait_table_bits(zone->wait_table_hash_nr_entries);
//....
}
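The two helpers used here size the table and report its order: wait_table_hash_nr_entries() picks a power-of-two number of buckets proportional to the zone size but capped small (few tasks ever wait on pages at once), and wait_table_bits() returns its base-2 logarithm. Roughly:
#define PAGES_PER_WAITQUEUE     256

static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
        unsigned long size = 1;

        pages /= PAGES_PER_WAITQUEUE;

        /* Round up to the next power of two... */
        while (size < pages)
                size <<= 1;

        /* ...but keep the table small: waiters on pages are rare. */
        size = min(size, 4096UL);

        return max(size, 4UL);
}

static inline unsigned long wait_table_bits(unsigned long size)
{
        return ffz(~size);      /* log2 of a power-of-two size */
}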
Search starting from the low-order (least significant) bit.
static inline unsigned long __ffs(unsigned long word)
{
asm("bsf %1,%0"
: "=r" (word)
: "rm" (word));
return word;
}
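As a quick worked example (values chosen purely for illustration): for a zone whose wait table ends up with 4096 entries, ffz(~4096) is equivalent to __ffs(4096); BSF finds the lowest set bit, bit 12, so wait_table_bits is 12 and hash_long() in bit_waitqueue() folds the hash value down to a 12-bit bucket index.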
static void inode_wait_for_writeback(struct inode *inode,
struct bdi_writeback *wb)
{
DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
wait_queue_head_t *wqh;
wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
while (inode->i_state & I_SYNC) {
spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock);
__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
}
}
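The matching wake-up is inode_sync_complete(), called near the end of writeback_single_inode() below once I_SYNC has been cleared; in this kernel era it is roughly:
static void inode_sync_complete(struct inode *inode)
{
        /*
         * Make sure the clearing of I_SYNC is visible before we wake up
         * waiters sleeping in inode_wait_for_writeback().
         */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_SYNC);
}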
The kernel can start to synchronize data from various different places, but all paths save one end up in sync_sb_inodes. That function is responsible for synchronizing all dirty inodes belonging to a given superblock; writeback_single_inode is then used for each individual inode.
static int
writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
struct writeback_control *wbc)
{
struct address_space *mapping = inode->i_mapping;
long nr_to_write = wbc->nr_to_write;
unsigned dirty;
int ret;
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
if (!atomic_read(&inode->i_count))
WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
else
WARN_ON(inode->i_state & I_WILL_FREE);
if (inode->i_state & I_SYNC) {
/*
* If this inode is locked for writeback and we are not doing
* writeback-for-data-integrity, move it to b_more_io so that
* writeback can proceed with the other inodes on s_io.
*
* We'll have another go at writing back this inode when we
* completed a full scan of b_io.
*/
if (wbc->sync_mode != WB_SYNC_ALL) {
requeue_io(inode, wb);
trace_writeback_single_inode_requeue(inode, wbc,
nr_to_write);
return 0;
}
/*
* It's a data-integrity sync. We must wait.
*/
inode_wait_for_writeback(inode, wb);
}
BUG_ON(inode->i_state & I_SYNC);
/* Set I_SYNC, reset I_DIRTY_PAGES */
inode->i_state |= I_SYNC;
inode->i_state &= ~I_DIRTY_PAGES;
spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock);
ret = do_writepages(mapping, wbc);
/*
* Make sure to wait on the data before writing out the metadata.
* This is important for filesystems that modify metadata on data
* I/O completion.
*/
if (wbc->sync_mode == WB_SYNC_ALL) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
}
/*
* Some filesystems may redirty the inode during the writeback
* due to delalloc, clear dirty metadata flags right before
* write_inode()
*/
spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
spin_unlock(&inode->i_lock);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
}
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
inode->i_state &= ~I_SYNC;
if (!(inode->i_state & I_FREEING)) {
/*
* Sync livelock prevention. Each inode is tagged and synced in
* one shot. If still dirty, it will be redirty_tail()'ed below.
* Update the dirty time to prevent enqueue and sync it again.
*/
if ((inode->i_state & I_DIRTY) &&
(wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
inode->dirtied_when = jiffies;
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
/*
* We didn't write back all the pages. nfs_writepages()
* sometimes bales out without doing anything.
*/
inode->i_state |= I_DIRTY_PAGES;
if (wbc->nr_to_write <= 0) {
/*
* slice used up: queue for next turn
*/
requeue_io(inode, wb);
} else {
/*
* Writeback blocked by something other than
* congestion. Delay the inode for some time to
* avoid spinning on the CPU (100% iowait)
* retrying writeback of the dirty page/inode
* that cannot be performed immediately.
*/
redirty_tail(inode, wb);
}
} else if (inode->i_state & I_DIRTY) {
/*
* Filesystems can dirty the inode during writeback
* operations, such as delayed allocation during
* submission or metadata updates after data IO
* completion.
*/
redirty_tail(inode, wb);
} else {
/*
* The inode is clean. At this point we either have
* a reference to the inode or it's on it's way out.
* No need to add it back to the LRU.
*/
list_del_init(&inode->i_wb_list);
}
}
inode_sync_complete(inode);
trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}
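requeue_io() and redirty_tail() used above move the inode between the bdi_writeback lists shown earlier (b_more_io and b_dirty respectively); sketched roughly from this kernel era:
/* Requeue inode for another pass once the current b_io batch is finished. */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
        assert_spin_locked(&wb->list_lock);
        list_move(&inode->i_wb_list, &wb->b_more_io);
}

/*
 * Put the inode back on b_dirty. Take care not to make it look older
 * than the inodes already there, which would confuse the time-ordered
 * expiry logic of the kupdate-style writeback.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
        assert_spin_locked(&wb->list_lock);
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;

                tail = wb_inode(wb->b_dirty.next);
                if (time_before(inode->dirtied_when, tail->dirtied_when))
                        inode->dirtied_when = jiffies;
        }
        list_move(&inode->i_wb_list, &wb->b_dirty);
}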