sysprog 16
TRANSCRIPT
- 1. C/C++ Linux System Programming
-
- Session 16
-
-
- User-space System Programming
-
- session 6
2. Outline
- Filesystem concepts
- File I/O Ops
3. Filesystem
- Traditionally: An abstraction for storage device access
- Why?
-
- Common sensible organization
-
- Encapsulate OS HW interaction, e.g. performance considerations
4. VFS
- Wider-range abstraction:
-
- special FS, different types of disk FS, network FS
-
- Common user interface
-
- Multiple FS's
-
- Common handling
5. Mounts
- Superblock: filesystem control block
- Mount point
- Syscalls
-
- int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
-
- int umount(const char *target);
6. FS Objects and Metadata
- Inode: file control block
-
- A unique ID
-
- Access/Owner info
-
- Memory maps
-
- Block device info
- Dirent: file as a directory entry (not physical)
- File: file data and hook to metadata (not physical)
7. Journaling
- Problem:
-
- operations on metadata are non-atomic, can be interrupted by power loss
- Physical vs logical journals
- Metadata-only journals
8. Disk Cache
- Buffers
- Page cache
- Writeback: pdflush
- Read-ahead
9. File Descriptors
- Descriptors index into process file table
- int open(const char *pathname, int flags);
- int open(const char *pathname, int flags, mode_t mode);
- int creat(const char *pathname, mode_t mode);
-
- Open with O_CREAT (disk files only)
- int close(int fd); /* notice status !! */
10. File I/O modes
- int fcntl(int fd, int cmd, long arg); // F_SETFL
- Nonblocking: If not ready, EAGAIN - O_NONBLOCK
- Synchronized: Wait until data is on HW - O_SYNC
-
- int fsync(int fd);
- Asynchronous: Signal when ready - O_ASYNC
-
- SIGIO handler
-
- fcntl: F_GETSIG / F_SETSIG, F_SETOWN/F_GETOWN (process getting signal)
- Direct: Directly from user buffer - O_DIRECT
11. More File control
- int unlink(const char *pathname);
- int truncate(const char *path, off_t length);
-
- int ftruncate(int fd, off_t length);
-
- O_TRUNC on open
12. Descriptor I/O
- ssize_t read(int fd, void *buf, size_t count);
- ssize_t write(int fd, const void *buf, size_t count);
- off_t lseek(int fd, off_t offset, int whence);
-
- SEEK_SET, SEEK_CUR, SEEK_END
- EOF
13. IO Vectors
- ssize_t readv(int fd, const struct iovec *iov, int iovcnt);
- ssize_t writev(int fd, const struct iovec *iov, int iovcnt);
struct iovec { void *iov_base; /* Starting address */ size_t iov_len; /* Number of bytes to transfer */ }; 14. int echo_main(int argc, char **argv) { struct iovec io[argc]; struct iovec *cur_io = io; char *arg; char *p; ... while (1) { int c; cur_io->iov_base = p = arg; ... while ((c = *arg++)) { if (c == eflag) { /* Check for escape seq. */ if (*arg == 'c') { /* 'c' means cancel newline and ignore all subsequent chars. */ cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; goto ret; } ... c = bb_process_escape_sequence( (void*) &arg); } *p++ = c; } arg = *++argv; if (arg) *p++ = ' '; cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; if (!arg) break; } ret: return writev(1, io, (cur_io - io)) >= 0; } 15. Memory Mapped file
- void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
- int munmap(void *start, size_t length);
- Important flags:
-
- No anonymous, MAP_SHARED, MAP_FIXED, MAP_POPULATE ( | MAP_NONBLOCK)
- int msync(void *start, size_t length, int flags); // MS_SYNC or MS_ASYNC
- void *mremap(void *old_address, size_t old_size, size_t new_size, int flags);
16. Locking
- Mandatory Locking (BSD)
-
- ~S_IXGRP | SGID ( + mount flag MS_MANDLOCK)
-
- Racy (mmap)
- Advisory Locking
-
- Both sides play nice
17. Advisory Locking
- int flock(int fd, int operation); // LOCK_SH, LOCK_EX, LOCK_UN
- int lockf(int fd, int cmd, off_t len); // F_LOCK, F_TLOCK, F_ULOCK, F_TEST
- fcntl: F_GETLK, F_SETLK, F_SETLKW
-
- High level of control (with offset, down to a single byte)
struct flock { ... short l_type;/* Type of lock: F_RDLCK, F_WRLCK, F_UNLCK */ short l_whence;/* How to interpret l_start: SEEK_SET, SEEK_CUR, SEEK_END */ off_t l_start;/* Starting offset for lock */ off_t l_len;/* Number of bytes to lock */ pid_t l_pid;/* PID of process blocking our lock (F_GETLK only) */ ... }; 18. #ifdef F_SETLK #ifndef SEEK_SET #define SEEK_SET 0 #endif struct flock lock_data; lock_data.l_type = F_WRLCK; lock_data.l_whence = SEEK_SET; lock_data.l_start = lock_data.l_len = 0; if (fcntl(pidFd, F_SETLK, &lock_data) == -1) { if (errno == EAGAIN) return oldpid; else return -1; } #else #ifdef LOCK_EX if (flock (pidFd, LOCK_EX|LOCK_NB) == -1) { if (errno == EWOULDBLOCK) return oldpid; else return -1; } #else if (lockf (pidFd, F_TLOCK, 0) == -1) { if (errno == EACCES) return oldpid; else return -1; } #endif #endif } 19. Buffered I/O
- Streams: Buffer I/O and write to kernel at once
-
- Better alignment
-
- Less system calls
-
- Yet another cache!!
-
- FILE *
-
- Formatting
- FILE *fopen(const char *path, const char *mode);
- FILE *fdopen(int fd, const char *mode);
- int fclose(FILE *fp);
- int fileno(FILE *stream);
20. I/O
- size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
- size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
- Formatted
-
- int fprintf(FILE *stream, const char *format, ...);
-
- int fscanf(FILE *stream, const char *format, ...);
- Char
-
- int fputc(int c, FILE *stream);
-
- int fgetc(FILE *stream);
-
- int ungetc(int c, FILE *stream);
- String
-
- int fputs(const char *s, FILE *stream);
-
- char *fgets(char *s, int size, FILE *stream);
21. Behind the Scenes
- Inherently thread-safe
- To do your own locking (of the stream, not the file)
-
- void flockfile(FILE *filehandle);
-
- int ftrylockfile(FILE *filehandle);
-
- void funlockfile(FILE *filehandle);
-
- xxx_unlocked versions (e.g. fread_unlocked)
- Flushing the stream (not the page cache)
-
- int fflush(FILE *stream);
22. Errors
- int feof(FILE *stream);
- int ferror(FILE *stream);
- void clearerr(FILE *stream);
- Stream ops' return values alone cannot distinguish EOF from error (use feof()/ferror()); descriptor read() returns 0 on EOF and -1 on error
23. Positioning
- int fseek(FILE *stream, long offset, int whence);
- long ftell(FILE *stream);
- int fgetpos(FILE *stream, fpos_t *pos);
- int fsetpos(FILE *stream, const fpos_t *pos);
24. Metadata
- int fstat(int fd, struct stat *buf);
- int stat(const char *path, struct stat *buf);
-
- lstat : BSD only
-
- Requires execute (search) permission on all directories in the path
struct stat { dev_t st_dev; /* ID of device containing file */ ino_t st_ino; /* inode number */ mode_t st_mode; /* protection */ nlink_t st_nlink; /* number of hard links */ uid_t st_uid; /* user ID of owner */ gid_t st_gid; /* group ID of owner */ dev_t st_rdev; /* device ID (if special file) */ off_t st_size; /* total size, in bytes */ blksize_t st_blksize; /* blocksize for filesystem I/O */ blkcnt_t st_blocks; /* number of blocks allocated */ time_t st_atime; /* time of last access */ time_t st_mtime; /* time of last modification */ time_t st_ctime; /* time of last status change */ }; 25. Directory Streams
- A directory is a file whose entries are other inodes
- DIR *opendir(const char *name);
- int closedir(DIR *dir);
- struct dirent *readdir(DIR *dir);
struct dirent { ino_t d_ino; /* inode number */ off_t d_off; /* offset to the next dirent */ unsigned short d_reclen; /* length of this record */ unsigned char d_type; /* type of file */ char d_name[256]; /* filename */ }; 26. static pid_list *scan_proc_pids(inode_list *ilist) { DIR *d; struct dirent *de; pid_t pid; pid_list *plist; xchdir("/proc"); d = opendir("/proc"); if (!d) return NULL; plist = NULL; while ((de = readdir(d)) != NULL) { pid = (pid_t)bb_strtou(de->d_name, NULL, 10); if (errno) continue; if (chdir(de->d_name) < 0) continue; plist = scan_link("cwd", pid, ilist, plist); plist = scan_link("exe", pid, ilist, plist); plist = scan_link("root", pid, ilist, plist); .... } closedir(d); return plist; } static pid_list *scan_link(const char *lname, pid_t pid, inode_list *ilist, pid_list *plist) { ino_t inode; dev_t dev; if (!file_to_dev_inode(lname, &dev, &inode)) return plist; if (search_dev_inode(ilist, dev, inode)) plist = add_pid(plist, pid); return plist; } static int file_to_dev_inode(const char *filename, dev_t *dev, ino_t *inode) { struct stat f_stat; if (stat(filename, &f_stat)) return 0; *inode = f_stat.st_ino; *dev = f_stat.st_dev; return 1; } static int search_dev_inode(inode_list *ilist, dev_t dev, ino_t inode) { while (ilist) { if (ilist->dev == dev) { if (option_mask32 & OPT_MOUNT) return 1; if (ilist->inode == inode) return 1; } ilist = ilist->next; } return 0; } 27. I/O Multiplexing
- int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
- int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
- int poll(struct pollfd *fds, nfds_t nfds, int timeout);
- int ppoll(struct pollfd *fds, nfds_t nfds, const struct timespec *timeout, const sigset_t *sigmask);
-
- POLLIN/POLLOUT/POLLPRI/POLLERR
void FD_CLR(int fd, fd_set *set); int FD_ISSET(int fd, fd_set *set); void FD_SET(int fd, fd_set *set); void FD_ZERO(fd_set *set); struct pollfd { int fd; /* file descriptor */ short events; /* requested events */ short revents; /* returned events */ }; 28. Epoll
- Decouple interest set registration from poll
-
- +: O(1) on the wait
-
- +: Edge trigger
-
- - : system call for adding onto the set
- int epoll_create(int size); //desc, need close
- int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
- int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
typedef union epoll_data { void *ptr; int fd; uint32_t u32; uint64_t u64; } epoll_data_t; struct epoll_event { uint32_t events; /* Epoll events */ epoll_data_t data; /* User data variable */ }; 29. 30. IOCTL
- Device / special file control
- int ioctl(int d, int request, ...);
- Request is specific to device being controlled, and may have a payload (ioctl_list)
31. Filesystem events
- int inotify_init(void); // desc, need close
- int inotify_add_watch(int fd, const char *pathname, uint32_t mask); // watch desc
- int inotify_rm_watch(int fd, uint32_t wd);
- FIONREAD ioctl
- fcntl: F_NOTIFY
struct inotify_event { int wd; /* watch descriptor */ uint32_t mask; /* mask of events */ uint32_t cookie; /* unique cookie */ uint32_t len; /* size of 'name' field */ char name[]; /* null-terminated name */ }; 32. int inotifyd_main(int argc UNUSED_PARAM, char **argv) { unsigned mask = IN_ALL_EVENTS; // assume we want all events struct pollfd pfd; char **watched = ++argv; // watched name list const char *args[] = { *argv, NULL, NULL, NULL, NULL }; // open inotify pfd.fd = inotify_init(); if (pfd.fd < 0) bb_perror_msg_and_die("no kernel support"); // setup watched while (*++argv) { char *path = *argv; char *masks = strchr(path, ':'); int wd; // watch descriptor // if mask is specified -> if (masks) { *masks = '\0'; // split path and mask // convert mask names to mask bitset mask = 0; while (*++masks) { int i = strchr(mask_names, *masks) - mask_names; if (i >= 0) { mask |= (1 << i); } } } ... } ... while (... > 0) { ssize_t len; void *buf; struct inotify_event *ie; // read out all pending events xioctl(pfd.fd, FIONREAD, &len); #define eventbuf bb_common_bufsiz1 ie = buf = (len <= sizeof(eventbuf)) ? eventbuf : xmalloc(len); ... while (len > 0) { int i; char events[12]; char *s = events; unsigned m = ie->mask; for (i = 0; i < 12; ++i, m >>= 1) { if (m & 1) { *s++ = mask_names[i]; } } *s = '\0'; args[1] = events; args[2] = watched[ie->wd]; args[3] = ie->len ? ie->name : NULL; xspawn((char **)args); // next event i = sizeof(struct inotify_event) + ie->len; len -= i; ie = (void*)((char*)ie + i); } if (eventbuf != buf) free(buf); } return EXIT_SUCCESS; } 33. Asynchronous I/O
- Only on O_DIRECT
struct aiocb { int aio_fildes; /* file descriptor */ int aio_lio_opcode; /* operation to perform */ int aio_reqprio; /* request priority offset */ volatile void *aio_buf; /* pointer to buffer */ size_t aio_nbytes; /* length of operation */ struct sigevent aio_sigevent; /* signal number and value */ /* internal, private members follow... */ }; int aio_read (struct aiocb *aiocbp); int aio_write (struct aiocb *aiocbp); int aio_error (const struct aiocb *aiocbp); int aio_return (struct aiocb *aiocbp); int aio_cancel (int fd, struct aiocb *aiocbp); int aio_fsync (int op, struct aiocb *aiocbp); int aio_suspend (const struct aiocb * const cblist[], int n, const struct timespec *timeout);