linux proc 文件系统

weird5

已于 2025-09-10 13:44:31 修改

阅读量1.1k

点赞数 24

CC 4.0 BY-SA版权

文章标签： linux

于 2025-09-09 18:07:00 首次发布

本文链接：https://blog.csdn.net/weiw213/article/details/151355766

linux proc 文件系统概述

一引言
二实现
三遗留问题

一引言

proc文件系统是Linux内核中一个非常重要的虚拟文件系统，提供了一个与内核进行交互的接口，通过 proc文件系统，用户和程序可以方便地读取和修改内核数据结构信息，包括系统内核、进程、硬件等。本文将从内核源码角度分析 proc 文件系统的实现原理，包括其初始化、文件的创建与操作、动态内容生成等。
本文分析代码的内核代码版本为linux 2.6.10。源码路径：fs/proc/
至于为什么不分析v6.x的代码，有三个原因。第一点：2.6.10版本的代码量小，vscode可以加载全部代码。第二点：实现相对较简单，上手比较快。第三点：我的目的是分析proc实现的框架，我相信linux的proc大的架构应该变化不大（这个是我主观认为的）。后续会再浏览下高版本的实现。

二实现

2.1 初始化

先来看下初始化过程代码：

/*
 * Initialise procfs: set up the inode cache, register proc_fs_type, mount it
 * through kern_mount(), then create the fixed top-level entries.
 * The function returns void, so every failure path simply returns early.
 */
void __init proc_root_init(void)
{
	int err = proc_init_inodecache();
	if (err)
		return;
	err = register_filesystem(&proc_fs_type);
	if (err)
		return;
	/* The resulting vfsmount is stored in the global proc_mnt. */
	proc_mnt = kern_mount(&proc_fs_type);
	err = PTR_ERR(proc_mnt);
	if (IS_ERR(proc_mnt)) {
		/* Mount failed: undo the registration done above. */
		unregister_filesystem(&proc_fs_type);
		return;
	}
	/* Populate the simple files (loadavg, meminfo, cmdline, ...). */
	proc_misc_init();
	/* Create the per-subsystem directories; NULL parent is used for all. */
	proc_net = proc_mkdir("net", NULL);
	proc_net_stat = proc_mkdir("net/stat", NULL);
	proc_root_fs = proc_mkdir("fs", NULL);
	proc_root_driver = proc_mkdir("driver", NULL);
	proc_mkdir("fs/nfsd", NULL);
	proc_tty_init();
	proc_bus = proc_mkdir("bus", NULL);
}

主要流程是创建缓存、文件系统注册、挂载、接着创建根据系统功能进行分类的目录。(好的代码就不用详细看逻辑，从函数字面上就知道该函数的功能及如何实现该功能的)。

2.1.1 proc文件系统注册

proc_init_inodecache 是创建缓存，我们暂时不关心，所以直接来看register_filesystem的源码

/*
 * Add @fs to the global file_systems list.
 *
 * Returns 0 on success, -EINVAL when @fs is NULL, and -EBUSY when @fs
 * already has a non-NULL ->next or a type with the same name is already
 * on the list.
 */
int register_filesystem(struct file_system_type * fs)
{
	int res = 0;
	struct file_system_type ** p;

	if (!fs)
		return -EINVAL;
	/* A non-NULL ->next is taken to mean @fs is already linked somewhere. */
	if (fs->next)
		return -EBUSY;
	INIT_LIST_HEAD(&fs->fs_supers);
	/* The lookup and the insertion happen under the same write lock. */
	write_lock(&file_systems_lock);
	p = find_filesystem(fs->name);
	if (*p)
		res = -EBUSY;	/* slot points at an existing same-named type */
	else
		*p = fs;	/* slot is the terminating NULL link: append here */
	write_unlock(&file_systems_lock);
	return res;
}

/*
 * Walk the file_systems list looking for a type called @name.
 *
 * Returns the address of the link that points at the matching entry, or,
 * when nothing matches, the address of the NULL link that ends the list.
 * The caller tells the two cases apart by testing *p.
 */
static struct file_system_type **find_filesystem(const char *name)
{
	struct file_system_type **p;
	/* p tracks the link itself (not the node) so the caller can write through it. */
	for (p=&file_systems; *p; p=&(*p)->next)
		if (strcmp((*p)->name,name) == 0)
			break;
	return p;
}

register_filesystem接口主要是在全局的file_systems列表里用name查询是否有同名的被注册了，如果已被注册则返回-EBUSY，否则返回 file_systems 的最后一个为空的节点的指针的地址，然后让该指针指向proc_fs_type 的地址。

/*
 * Filesystem type descriptor for procfs. This is the object that
 * proc_root_init() hands to register_filesystem() and kern_mount().
 */
static struct file_system_type proc_fs_type = {
	/* Name compared by find_filesystem() when searching file_systems. */
	.name		= "proc",
	.get_sb		= proc_get_sb,
	.kill_sb	= kill_anon_super,
};

注册过后proc_fs_type就可以通过遍历file_systems列表被查询到了。

2.1.2 挂载过程

本节分析 kern_mount 函数。

/*
 * Thin wrapper around do_kern_mount(): the type's name is passed as both
 * the fstype and the name argument, with flags 0 and no mount data.
 * The return value of do_kern_mount() is passed through unchanged.
 */
struct vfsmount *kern_mount(struct file_system_type *type)
{
	return do_kern_mount(type->name, 0, type->name, NULL);
}

在分析该函数前，我们先看下此处的挂载和系统调用mount的实现之间是什么关系，下面是系统调用mount的实现到do_kern_mount之间的调用栈。

asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name, char __user * type, unsigned long flags, void __user * data)
	long do_mount(char * dev_name, char * dir_name, char *type_page, unsigned long flags, void *data_page)
		int do_new_mount(struct nameidata *nd, char *type, int flags,int mnt_flags, char *name, void *data)
			struct vfsmount *do_kern_mount(const char *fstype, int flags, const char *name, void *data)

大概看了下mount的实现，在do_kern_mount 执行前都是关于标志位的判断处理、挂载点和设备相关的处理逻辑。也就是说对于系统调用mount新建一个挂载点来说，其核心逻辑在 do_new_mount 中，而do_new_mount 中只有do_kern_mount 和 do_add_mount两部分内容。我们本次要分析的proc文件系统挂载函数kern_mount只是对do_kern_mount的一层简单封装，两者最终走的是同一个函数。

/*
 * Create a new mount of filesystem @type and attach it at @nd.
 *
 * Two steps: do_kern_mount() builds the vfsmount, then do_add_mount()
 * attaches it. Returns -EINVAL for a bad @type, -EPERM without
 * CAP_SYS_ADMIN, the error encoded in the pointer if do_kern_mount()
 * fails, otherwise whatever do_add_mount() returns.
 */
static int do_new_mount(struct nameidata *nd, char *type, int flags,
			int mnt_flags, char *name, void *data)
{
	struct vfsmount *mnt;
	/* @type must be non-NULL and contain a NUL within PAGE_SIZE bytes. */
	if (!type || !memchr(type, 0, PAGE_SIZE))
		return -EINVAL;
	/* we need capabilities... */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	mnt = do_kern_mount(type, flags, name, data);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);
	return do_add_mount(mnt, nd, mnt_flags, NULL);
}

do_kern_mount 就是我们分析proc挂载时要分析的函数。到此时我们需要了解下面几个结构体：

/*
 * Helper state used during path lookup. It carries both the dentry and the
 * vfsmount for the path being resolved.
 * NOTE(review): the roles of last/last_type/depth/saved_names are not shown
 * in this excerpt; confirm against fs/namei.c before relying on them.
 */
struct nameidata {
	struct dentry	*dentry;
	struct vfsmount *mnt;
	struct qstr	last;
	unsigned int	flags;
	int		last_type;
	unsigned	depth;
	/* One slot per nesting level plus one (MAX_NESTED_LINKS + 1 entries). */
	char *saved_names[MAX_NESTED_LINKS + 1];

	/* Intent data */
	union {
		struct open_intent open;
	} intent;
};
/*
 * Describes one mount: which superblock is mounted (mnt_sb), the root of
 * the mounted tree (mnt_root), and where it hangs in the mount hierarchy
 * (mnt_parent / mnt_mountpoint). do_kern_mount() returns a pointer to one
 * of these.
 */
struct vfsmount
{
	struct list_head mnt_hash;
	struct vfsmount *mnt_parent;	/* fs we are mounted on */
	struct dentry *mnt_mountpoint;	/* dentry of mountpoint */
	struct dentry *mnt_root;	/* root of the mounted tree */
	struct super_block *mnt_sb;	/* pointer to superblock */
	struct list_head mnt_mounts;	/* list of children, anchored here */
	struct list_head mnt_child;	/* and going through their mnt_child */
	atomic_t mnt_count;
	int mnt_flags;
	int mnt_expiry_mark;		/* true if marked for expiry */
	char *mnt_devname;		/* Name of device e.g. /dev/dsk/hda1 */
	struct list_head mnt_list;
	struct list_head mnt_fslink;	/* link in fs-specific expiry list */
	struct namespace *mnt_namespace; /* containing namespace */
};

/*
 * In-memory representation of one node of the directory tree (one path
 * component). It links a name (d_name) to an inode (d_inode) and to its
 * parent and children in the tree.
 */
struct dentry {
	atomic_t d_count;
	unsigned int d_flags;		/* protected by d_lock */
	spinlock_t d_lock;		/* per dentry lock */
	struct inode *d_inode;		/* Where the name belongs to - NULL is
					 * negative */
	/*
	 * The next three fields are touched by __d_lookup.  Place them here
	 * so they all fit in a 16-byte range, with 16-byte alignment.
	 */
	struct dentry *d_parent;	/* parent directory */
	struct qstr d_name;

	struct list_head d_lru;		/* LRU list */
	struct list_head d_child;	/* child of parent list */
	struct list_head d_subdirs;	/* our children */
	struct list_head d_alias;	/* inode alias list */
	unsigned long d_time;		/* used by d_revalidate */
	struct dentry_operations *d_op;
	struct super_block *d_sb;	/* The root of the dentry tree */
	void *d_fsdata;			/* fs-specific data */
 	struct rcu_head d_rcu;
	struct dcookie_struct *d_cookie; /* cookie, if any */
	struct hlist_node d_hash;	/* lookup hash list */	
	int d_mounted;
	unsigned char d_iname[DNAME_INLINE_LEN_MIN];	/* small names */
};

dentry 是树状目录结构中的一个节点在内存中的抽象。也就是说一个路径肯定是对应一个dentry对象的，并且该对象只存在于内存中，不存在于磁盘上。nameidata 是路径查询过程中的辅助数据结构，该结构里既有dentry信息，同时也包含vfsmount信息。vfsmount是被访问路径对应的挂载点的信息。它们之间更详细的关系不在本文的讨论范畴。

do_kern_mount根据文件系统类型（如 ext4、tmpfs）、设备路径、挂载选项等参数，加载对应的文件系统驱动，初始化文件系统的核心数据结构（如 vfsmount、super_block），最终返回一个 “已就绪但尚未关联到挂载点” 的文件系统实例。做好挂载前的所有准备工作，最后返回vfsmount对象。
do_add_mount：负责完成挂载点的关联与注册—— 它接收 do_kern_mount 准备好的文件系统实例，以及用户指定的 “挂载点路径”，然后做一系列校验（如挂载点是否存在、是否已被挂载、权限是否足够），最终将 “文件系统实例” 与 “挂载点路径” 绑定，并把这个挂载关系注册到内核的挂载管理链表中。至此，用户空间才能通过挂载点访问文件系统。

2.1.3 其他杂项的初始化

该函数代码如下：

/*
 * Create the miscellaneous top-level /proc entries.
 *
 * Three groups are set up: a table of simple read_proc-backed files, the
 * "mounts" symlink, and entries that install their own file_operations
 * (kmsg directly, the rest through create_seq_entry()).
 */
void __init proc_misc_init(void)
{
	struct proc_dir_entry *entry;
	/* Table of {file name, read callback}; terminated by a NULL name. */
	static struct {
		char *name;
		int (*read_proc)(char*,char**,off_t,int,int*,void*);
	} *p, simple_ones[] = {
		{"loadavg",     loadavg_read_proc},
		{"uptime",	uptime_read_proc},
		{"meminfo",	meminfo_read_proc},
		{"version",	version_read_proc},

		{"devices",	devices_read_proc},
		{"filesystems",	filesystems_read_proc},
		{"cmdline",	cmdline_read_proc},
		{"locks",	locks_read_proc},
		{"execdomains",	execdomains_read_proc},
		{NULL,}
	};
	/* mode 0, NULL base and NULL data for every simple entry. */
	for (p = simple_ones; p->name; p++)
		create_proc_read_entry(p->name, 0, NULL, p->read_proc, NULL);

	proc_symlink("mounts", NULL, "self/mounts");

	/* And now for trickier ones */
	entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);
	/* Only install the fops when the entry was actually created. */
	if (entry)
		entry->proc_fops = &proc_kmsg_operations;
	create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
	create_seq_entry("partitions", 0, &proc_partitions_operations);
	create_seq_entry("stat", 0, &proc_stat_operations);
	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
	create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
	create_seq_entry("diskstats", 0, &proc_diskstats_operations);
}

我们以simple_ones数组中一个成员(cmdline)为例来分析create_proc_read_entry函数。此时第一个参数为指向字符串"cmdline"的指针，第四个参数为注册的回调函数cmdline_read_proc，其他参数为0

/*
 * Create a proc entry via create_proc_entry() and, on success, attach the
 * @read_proc callback and its @data cookie to it.
 *
 * Returns the new entry, or NULL when create_proc_entry() returned NULL.
 */
static inline struct proc_dir_entry *create_proc_read_entry(const char *name, mode_t mode, struct proc_dir_entry *base, read_proc_t *read_proc, void * data)
{
	struct proc_dir_entry *res=create_proc_entry(name,mode,base);
	if (res) {
		res->read_proc=read_proc;
		res->data=data;
	}
	return res;
}

核心逻辑在create_proc_entry函数：

/*
 * Allocate a proc_dir_entry with proc_create() and link it under its parent
 * with proc_register().
 *
 * Returns the new entry, or NULL if allocation or registration failed.
 * The "..." lines mark code the article elided from the kernel original
 * (the setup of mode/nlink among it).
 */
struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent)
{
	struct proc_dir_entry *ent;
	nlink_t nlink;
    ...
	/* &parent is passed so proc_create() can update the parent pointer. */
	ent = proc_create(&parent,name,mode,nlink);
	if (ent) {
	    ...
		/* Registration failed: free the entry and report NULL. */
		if (proc_register(parent, ent) < 0) {
			kfree(ent);
			ent = NULL;
		}
	}
	return ent;
}

proc_create 会分配一个sizeof(struct proc_dir_entry)+strlen("cmdline")大小的空间并初始化该结构体的成员。尾部空间用于保存字符串"cmdline"。同时将parent指针指向proc_root 。

/*
 * Root node of the proc_dir_entry tree. It is a directory
 * (S_IFDIR, read and search permission for everyone) and is its own parent.
 */
struct proc_dir_entry proc_root = {
	.low_ino	= PROC_ROOT_INO, 
	/* 5 == strlen("/proc") */
	.namelen	= 5, 
	.name		= "/proc",
	.mode		= S_IFDIR | S_IRUGO | S_IXUGO, 
	.nlink		= 2, 
	.proc_iops	= &proc_root_inode_operations, 
	.proc_fops	= &proc_root_operations,
	/* Self-reference: the root has no other parent. */
	.parent		= &proc_root,
};

proc_register 函数代码如下：

/*
 * Link entry @dp into directory @dir and give it an inode number.
 *
 * Returns 0 on success, or -EAGAIN when no inode number could be obtained.
 * The "..." lines mark the directory and symlink branches that the article
 * elided from the kernel original.
 */
static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
{
	unsigned int i;
	
	/* get_inode_number() returns 0 to signal failure. */
	i = get_inode_number();
	if (i == 0)
		return -EAGAIN;
	dp->low_ino = i;
	/* Push @dp onto the head of @dir's child list. */
	dp->next = dir->subdir;
	dp->parent = dir;
	dir->subdir = dp;
	if (S_ISDIR(dp->mode)) {
        ...
	} else if (S_ISLNK(dp->mode)) {
		...
	} else if (S_ISREG(dp->mode)) {
		/* Regular files get the default operations unless the caller set some. */
		if (dp->proc_fops == NULL)
			dp->proc_fops = &proc_file_operations;
		if (dp->proc_iops == NULL)
			dp->proc_iops = &proc_file_inode_operations;
	}
	return 0;
}

proc_register 函数的主要功能就是把新创建的proc_dir_entry 对象挂在树上，并且挂上操作该文件时的回调函数 proc_file_operations。这棵树上的每个节点类型如下：

/*
 * One node of the /proc tree. Nodes are linked through next/parent/subdir,
 * and carry the callbacks (proc_fops, proc_iops, read_proc, write_proc)
 * used when the corresponding file is accessed.
 */
struct proc_dir_entry {
	unsigned int low_ino;	/* inode number; assigned in proc_register() */
	unsigned short namelen;
	const char *name;
	mode_t mode;
	nlink_t nlink;
	uid_t uid;
	gid_t gid;
	unsigned long size;
	struct inode_operations * proc_iops;
	struct file_operations * proc_fops;
	get_info_t *get_info;
	struct module *owner;
	struct proc_dir_entry *next, *parent, *subdir;	/* next: sibling link; parent: parent node; subdir: first child */
	void *data;	/* set together with read_proc by create_proc_read_entry() */
	read_proc_t *read_proc;
	write_proc_t *write_proc;
	atomic_t count;		/* use count */
	int deleted;		/* delete flag */
};

这棵树其实就是一个二叉链（参见https://blog.csdn.net/weiw213/article/details/150600041?spm=1011.2124.3001.6209），上面数据结构中的next等同于二叉链数据结构中的sibling指针，parent是相同的成员，subdir就是child指针。
proc_register 函数中还有个get_inode_number函数值得说下：

/*
 * Allocate an id from proc_inum_idr and turn it into a proc inode number.
 *
 * Returns the inode number, or 0 on failure; proc_register() treats 0 as
 * the error value.
 */
static unsigned int get_inode_number(void)
{
	int i, inum = 0;
	int error;
retry:
	/* A zero return from idr_pre_get() is treated as failure. */
	if (idr_pre_get(&proc_inum_idr, GFP_KERNEL) == 0)
		return 0;
	/* Only the id allocation itself runs under proc_inum_lock. */
	spin_lock(&proc_inum_lock);
	error = idr_get_new(&proc_inum_idr, NULL, &i);
	spin_unlock(&proc_inum_lock);
	/* -EAGAIN restarts from idr_pre_get(); any other error gives up. */
	if (error == -EAGAIN)
		goto retry;
	else if (error)
		return 0;
	/* Mask the id and offset it by PROC_DYNAMIC_FIRST. */
	inum = (i & MAX_ID_MASK) + PROC_DYNAMIC_FIRST;

	return inum;
}

该函数是从idr上申请一个未使用的位置，将该位置编号作为proc_dir_entry节点的inode编号，idr的实现和用法可以单独写一篇文章来介绍。
在create_proc_entry接口正常创建一个proc_dir_entry 对象后，会把cmdline_read_proc作为回调函数赋值给proc_dir_entry 的read_proc成员。

/*
 * read_proc callback for /proc/cmdline: formats saved_command_line followed
 * by a newline into @page, then lets proc_calc_metrics() compute the return
 * value from the formatted length and the caller's @off/@count window.
 * NOTE(review): nothing here bounds the sprintf() to the size of @page;
 * confirm how large @page is and how long saved_command_line can be.
 */
static int cmdline_read_proc(char *page, char **start, off_t off,
				 int count, int *eof, void *data)
{
	int len;

	len = sprintf(page, "%s\n", saved_command_line);
	return proc_calc_metrics(page, start, off, count, eof, len);
}

这样在用户读取/proc/cmdline时调用该回调函数cmdline_read_proc，来实现把全局的saved_command_line的内容返回到用户态。

2.2 用户态使用

本节从用户态使用的角度来分析用户是如何通过open和read系统调用把/proc/cmdline的内容读取到的。理论上本节内容应该可以回答第三节的遗留问题。

2.2.1 open系统调用

先来看下该系统调用的实现代码：

/*
 * open() system call entry point.
 *
 * Copies the user-space @filename with getname(), reserves a descriptor,
 * opens the file with filp_open() and binds the resulting struct file to
 * the descriptor. Returns the new fd, or a negative error code.
 */
asmlinkage long sys_open(const char __user * filename, int flags, int mode)
{
	char * tmp;
	int fd, error;

	tmp = getname(filename);
	/* Pre-load the error so a failed getname() falls through to return it. */
	fd = PTR_ERR(tmp);
	if (!IS_ERR(tmp)) {
		fd = get_unused_fd();
		if (fd >= 0) {
			struct file *f = filp_open(tmp, flags, mode);
			error = PTR_ERR(f);
			if (IS_ERR(f))
				goto out_error;
			/* Success: make fd refer to f. */
			fd_install(fd, f);
		}
		/* Also the re-entry point for out_error, so the name is always released. */
out:
		putname(tmp);
	}
	return fd;

	/* filp_open() failed: give the descriptor back and return the error. */
out_error:
	put_unused_fd(fd);
	fd = error;
	goto out;
}

该函数先是通过getname函数把用户态数据拷贝到内核cache上分配的空间上，然后通过get_unused_fd在进程的数据结构中找到文件描述符的空位置，接着filp_open函数把要打开的文件打开返回struct file 类型对象来描述被打开的文件，最后把struct file对象放到进程管理打开文件的struct files_struct类型的数据结构中，从而实现fd和struct file对象的一一映射。

2.2.2 read系统调用

三遗留问题

2.1.2节描述了mount系统调用在新建一个挂载时需要两步：第一步是根据参数准备vfsmount，第二步是将挂载关系注册到内核的挂载管理链表中，以便用户态程序可以通过挂载点来访问里面的文件。但是分析proc的挂载过程时，貌似只有准备vfsmount的过程，没有看到第二步，那么用户是如何访问对应的文件的呢？是否与proc_mnt是全局变量有关？还是访问方式不同？后续再继续分析。