diff --git a/fs/Kconfig b/fs/Kconfig index fc32266ed351..3914d839742d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -251,6 +251,7 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" +source "fs/erofs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 904279ceaf94..fb8ab653f506 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -136,3 +136,4 @@ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ obj-$(CONFIG_DYNAMIC_FSYNC) += dyn_sync_cntrl.o +obj-$(CONFIG_EROFS_FS) += erofs/ diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig new file mode 100644 index 000000000000..6fe179ebbd53 --- /dev/null +++ b/fs/erofs/Kconfig @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: GPL-2.0 + +config EROFS_FS + tristate "EROFS filesystem support" + depends on BLOCK + help + EROFS(Enhanced Read-Only File System) is a lightweight + read-only file system with modern designs (eg. page-sized + blocks, inline xattrs/data, etc.) for scenarios which need + high-performance read-only requirements, eg. firmwares in + mobile phone or LIVECDs. + + It also provides VLE compression support, focusing on + random read improvements, keeping relatively lower + compression ratios, which is useful for high-performance + devices with limited memory and ROM space. + + If unsure, say N. + +config EROFS_FS_DEBUG + bool "EROFS debugging feature" + depends on EROFS_FS + help + Print EROFS debugging messages and enable more BUG_ONs + which check the filesystem consistency aggressively. + + For daily use, say N. + +config EROFS_FS_XATTR + bool "EROFS extended attributes" + depends on EROFS_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config EROFS_FS_POSIX_ACL + bool "EROFS Access Control Lists" + depends on EROFS_FS_XATTR + select FS_POSIX_ACL + default y + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N. + +config EROFS_FS_SECURITY + bool "EROFS Security Labels" + depends on EROFS_FS_XATTR + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the erofs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. + +config EROFS_FS_USE_VM_MAP_RAM + bool "EROFS VM_MAP_RAM Support" + depends on EROFS_FS + help + use vm_map_ram/vm_unmap_ram instead of vmap/vunmap. + + If you don't know what these are, say N. + +config EROFS_FAULT_INJECTION + bool "EROFS fault injection facility" + depends on EROFS_FS + help + Test EROFS to inject faults such as ENOMEM, EIO, and so on. + If unsure, say N. + +config EROFS_FS_IO_MAX_RETRIES + int "EROFS IO Maximum Retries" + depends on EROFS_FS + default "5" + help + Maximum retry count of IO Errors. + + If unsure, leave the default value (5 retries, 6 IOs at most). + +config EROFS_FS_ZIP + bool "EROFS Data Compresssion Support" + depends on EROFS_FS + help + Currently we support VLE Compression only. + Play at your own risk. 
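+
+ (Here, VLE stands for Variable-sized Logical Extent, the on-disk
+ compressed extent format defined in erofs_fs.h later in this patch.)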
+ + If you don't want to use compression feature, say N. + +config EROFS_FS_CLUSTER_PAGE_LIMIT + int "EROFS Cluster Pages Hard Limit" + depends on EROFS_FS_ZIP + range 1 256 + default "1" + help + Indicates VLE compressed pages hard limit of a + compressed cluster. + + For example, if files of a image are compressed + into 8k-unit, the hard limit should not be less + than 2. Otherwise, the image cannot be mounted + correctly on this kernel. + +choice + prompt "EROFS VLE Data Decompression mode" + depends on EROFS_FS_ZIP + default EROFS_FS_ZIP_CACHE_BIPOLAR + help + EROFS supports three options for VLE decompression. + "In-place Decompression Only" consumes the minimum memory + with lowest random read. + + "Bipolar Cached Decompression" consumes the maximum memory + with highest random read. + + If unsure, select "Bipolar Cached Decompression" + +config EROFS_FS_ZIP_NO_CACHE + bool "In-place Decompression Only" + help + Read compressed data into page cache and do in-place + decompression directly. + +config EROFS_FS_ZIP_CACHE_UNIPOLAR + bool "Unipolar Cached Decompression" + help + For each request, it caches the last compressed page + for further reading. + It still decompresses in place for the rest compressed pages. + +config EROFS_FS_ZIP_CACHE_BIPOLAR + bool "Bipolar Cached Decompression" + help + For each request, it caches the both end compressed pages + for further reading. + It still decompresses in place for the rest compressed pages. + + Recommended for performance priority. + +endchoice + +config EROFS_FS_HUAWEI_EXTENSION + bool "EROFS HUAWEI Extension" + depends on EROFS_FS && EROFS_FS_ZIP + help + Targeted optimizations for HUAWEI Mobile phone. + + If unsure, say N. + diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile new file mode 100644 index 000000000000..9fa29ceec6fb --- /dev/null +++ b/fs/erofs/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 + +EROFS_VERSION = "0.5.180822.0" + +ccflags-y += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\" + +obj-$(CONFIG_EROFS_FS) += erofs.o +# staging requirement: to be self-contained in its own directory +ccflags-y += -I$(src)/include +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o +erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o +erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_vle_lz4.o + +# lz4 algorithm related stuffs +erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_lz4.o +erofs-$(CONFIG_ARM64) += $(addprefix lz4armv8/, \ + lz4armv8.o \ + lz4accel.o \ + ) +CFLAGS_unzip_lz4.o += -O3 + diff --git a/fs/erofs/data.c b/fs/erofs/data.c new file mode 100644 index 000000000000..314fdb6ffb98 --- /dev/null +++ b/fs/erofs/data.c @@ -0,0 +1,419 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/data.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ +#include "internal.h" +#include + +#include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)) +static inline void read_endio(struct bio *bio, int err) +#else +static inline void read_endio(struct bio *bio) +#endif +{ + int i; + struct bio_vec *bvec; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)) + const int err = bio->bi_status; +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)) + const int err = bio->bi_error; +#endif + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + /* page is already locked */ + BUG_ON(PageUptodate(page)); + + if (unlikely(err)) + SetPageError(page); + else + SetPageUptodate(page); + + unlock_page(page); + /* page could be reclaimed now */ + } + bio_put(bio); +} + +/* prio -- true is used for dir */ +struct page *__erofs_get_meta_page(struct super_block *sb, + erofs_blk_t blkaddr, bool prio, bool nofail) +{ + struct inode *const bd_inode = sb->s_bdev->bd_inode; + struct address_space *const mapping = bd_inode->i_mapping; + /* prefer retrying in the allocator to blindly looping below */ + const gfp_t gfp = mapping_gfp_constraint(mapping, ~__GFP_FS) | + (nofail ? __GFP_NOFAIL : 0); + unsigned int io_retries = nofail ? EROFS_IO_MAX_RETRIES_NOFAIL : 0; + struct page *page; + int err; + +repeat: + page = find_or_create_page(mapping, blkaddr, gfp); + if (unlikely(page == NULL)) { + DBG_BUGON(nofail); + return ERR_PTR(-ENOMEM); + } + DBG_BUGON(!PageLocked(page)); + + if (!PageUptodate(page)) { + struct bio *bio; + + bio = erofs_grab_bio(sb, blkaddr, 1, read_endio, nofail); + if (IS_ERR(bio)) { + DBG_BUGON(nofail); + err = PTR_ERR(bio); + goto err_out; + } + + err = bio_add_page(bio, page, PAGE_SIZE, 0); + if (unlikely(err != PAGE_SIZE)) { + err = -EFAULT; + goto err_out; + } + + __submit_bio(bio, REQ_OP_READ, + REQ_META | (prio ? REQ_PRIO : 0)); + + lock_page(page); + + /* this page has been truncated by others */ + if (unlikely(page->mapping != mapping)) { +unlock_repeat: + unlock_page(page); + put_page(page); + goto repeat; + } + + /* more likely a read error */ + if (unlikely(!PageUptodate(page))) { + if (io_retries) { + --io_retries; + goto unlock_repeat; + } + err = -EIO; + goto err_out; + } + } + return page; + +err_out: + unlock_page(page); + put_page(page); + return ERR_PTR(err); +} + +static int erofs_map_blocks_flatmode(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + erofs_blk_t nblocks, lastblk; + u64 offset = map->m_la; + struct erofs_vnode *vi = EROFS_V(inode); + + trace_erofs_map_blocks_flatmode_enter(inode, map, flags); + BUG_ON(is_inode_layout_compression(inode)); + + nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); + lastblk = nblocks - is_inode_layout_inline(inode); + + if (unlikely(offset >= inode->i_size)) { + /* leave out-of-bound access unmapped */ + map->m_flags = 0; + map->m_plen = 0; + goto out; + } + + /* there is no hole in flatmode */ + map->m_flags = EROFS_MAP_MAPPED; + + if (offset < blknr_to_addr(lastblk)) { + map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; + map->m_plen = blknr_to_addr(lastblk) - offset; + } else if (is_inode_layout_inline(inode)) { + /* 2 - inode inline B: inode, [xattrs], inline last blk... 
*/ + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + + map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(map->m_la); + map->m_plen = inode->i_size - offset; + + /* inline data should locate in one meta block */ + BUG_ON(erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE); + map->m_flags |= EROFS_MAP_META; + } else { + errln("internal error @ nid: %llu (size %llu), m_la 0x%llx", + vi->nid, inode->i_size, map->m_la); + BUG(); + } + +out: + map->m_llen = map->m_plen; + trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); + return 0; +} + +#ifdef CONFIG_EROFS_FS_ZIP +extern int z_erofs_map_blocks_iter(struct inode *, + struct erofs_map_blocks *, struct page **, int); +#endif + +int erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + struct page **mpage_ret, int flags) +{ + /* by default, reading raw data never use erofs_map_blocks_iter */ + if (unlikely(!is_inode_layout_compression(inode))) { + if (*mpage_ret != NULL) + put_page(*mpage_ret); + *mpage_ret = NULL; + + return erofs_map_blocks(inode, map, flags); + } + +#ifdef CONFIG_EROFS_FS_ZIP + return z_erofs_map_blocks_iter(inode, map, mpage_ret, flags); +#else + /* data compression is not available */ + return -ENOTSUPP; +#endif +} + +int erofs_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, int flags) +{ + if (unlikely(is_inode_layout_compression(inode))) { + struct page *mpage = NULL; + int err; + + err = erofs_map_blocks_iter(inode, map, &mpage, flags); + if (mpage != NULL) + put_page(mpage); + return err; + } + return erofs_map_blocks_flatmode(inode, map, flags); +} + +static inline struct bio *erofs_read_raw_page( + struct bio *bio, + struct address_space *mapping, + struct page *page, + erofs_off_t *last_block, + unsigned nblocks, + bool ra) +{ + struct inode *inode = mapping->host; + erofs_off_t current_block = (erofs_off_t)page->index; + int err; + + BUG_ON(!nblocks); + + if (PageUptodate(page)) { + err = 0; + goto has_updated; + } + + if (cleancache_get_page(page) == 0) { + err = 0; + SetPageUptodate(page); + goto has_updated; + } + + /* note that for readpage case, bio also equals to NULL */ + if (bio != NULL && + /* not continuous */ + *last_block + 1 != current_block) { +submit_bio_retry: + __submit_bio(bio, REQ_OP_READ, 0); + bio = NULL; + } + + if (bio == NULL) { + struct erofs_map_blocks map = { + .m_la = blknr_to_addr(current_block), + }; + erofs_blk_t blknr; + unsigned blkoff; + + err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (unlikely(err)) + goto err_out; + + /* zero out the holed page */ + if (unlikely(!(map.m_flags & EROFS_MAP_MAPPED))) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + + /* imply err = 0, see erofs_map_blocks */ + goto has_updated; + } + + /* for RAW access mode, m_plen must be equal to m_llen */ + BUG_ON(map.m_plen != map.m_llen); + + blknr = erofs_blknr(map.m_pa); + blkoff = erofs_blkoff(map.m_pa); + + /* deal with inline page */ + if (map.m_flags & EROFS_MAP_META) { + void *vsrc, *vto; + struct page *ipage; + + BUG_ON(map.m_plen > PAGE_SIZE); + + ipage = erofs_get_meta_page(inode->i_sb, blknr, 0); + + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto err_out; + } + + vsrc = kmap_atomic(ipage); + vto = kmap_atomic(page); + memcpy(vto, vsrc + blkoff, map.m_plen); + memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen); + kunmap_atomic(vto); + kunmap_atomic(vsrc); + flush_dcache_page(page); + + SetPageUptodate(page); + /* TODO: could we unlock the page earlier? 
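+ * The inline data has already been copied out and the target page
+ * marked uptodate above, so ipage only stays locked here for
+ * simplicity; unlocking it right after kunmap_atomic() looks safe.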
*/ + unlock_page(ipage); + put_page(ipage); + + /* imply err = 0, see erofs_map_blocks */ + goto has_updated; + } + + /* pa must be block-aligned for raw reading */ + BUG_ON(erofs_blkoff(map.m_pa) != 0); + + /* max # of continuous pages */ + if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) + nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); + if (nblocks > BIO_MAX_PAGES) + nblocks = BIO_MAX_PAGES; + + bio = erofs_grab_bio(inode->i_sb, + blknr, nblocks, read_endio, false); + + if (IS_ERR(bio)) { + err = PTR_ERR(bio); + bio = NULL; + goto err_out; + } + } + + err = bio_add_page(bio, page, PAGE_SIZE, 0); + /* out of the extent or bio is full */ + if (err < PAGE_SIZE) + goto submit_bio_retry; + + *last_block = current_block; + + /* shift in advance in case of it followed by too many gaps */ + if (unlikely(bio->bi_vcnt >= bio->bi_max_vecs)) { + /* err should reassign to 0 after submitting */ + err = 0; + goto submit_bio_out; + } + + return bio; + +err_out: + /* for sync reading, set page error immediately */ + if (!ra) { + SetPageError(page); + ClearPageUptodate(page); + } +has_updated: + unlock_page(page); + + /* if updated manually, continuous pages has a gap */ + if (bio != NULL) +submit_bio_out: + __submit_bio(bio, REQ_OP_READ, 0); + + return unlikely(err) ? ERR_PTR(err) : NULL; +} + +/* + * since we dont have write or truncate flows, so no inode + * locking needs to be held at the moment. + */ +static int erofs_raw_access_readpage(struct file *file, struct page *page) +{ + erofs_off_t last_block; + struct bio *bio; + + trace_erofs_readpage(page, true); + + bio = erofs_read_raw_page(NULL, page->mapping, + page, &last_block, 1, false); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + BUG_ON(bio != NULL); /* since we have only one bio -- must be NULL */ + return 0; +} + +static int erofs_raw_access_readpages(struct file *filp, + struct address_space *mapping, + struct list_head *pages, unsigned int nr_pages) +{ + erofs_off_t last_block; + struct bio *bio = NULL; + gfp_t gfp = readahead_gfp_mask(mapping); + struct page *page = list_last_entry(pages, struct page, lru); + + trace_erofs_readpages(mapping->host, page, nr_pages, true); + + for (; nr_pages; --nr_pages) { + page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + + if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { + bio = erofs_read_raw_page(bio, mapping, page, + &last_block, nr_pages, true); + + /* all the page errors are ignored when readahead */ + if (IS_ERR(bio)) { + pr_err("%s, readahead error at page %lu of nid %llu\n", + __func__, page->index, + EROFS_V(mapping->host)->nid); + + bio = NULL; + } + } + + /* pages could still be locked */ + put_page(page); + } + BUG_ON(!list_empty(pages)); + + /* the rare case (end in gaps) */ + if (unlikely(bio != NULL)) + __submit_bio(bio, REQ_OP_READ, 0); + return 0; +} + +/* for uncompressed (aligned) files and raw access for other files */ +const struct address_space_operations erofs_raw_access_aops = { + .readpage = erofs_raw_access_readpage, + .readpages = erofs_raw_access_readpages, +}; + diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c new file mode 100644 index 000000000000..6bd6ca6583ce --- /dev/null +++ b/fs/erofs/dir.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/dir.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. 
See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#include "internal.h" + +static const unsigned char erofs_filetype_table[EROFS_FT_MAX] = { + [EROFS_FT_UNKNOWN] = DT_UNKNOWN, + [EROFS_FT_REG_FILE] = DT_REG, + [EROFS_FT_DIR] = DT_DIR, + [EROFS_FT_CHRDEV] = DT_CHR, + [EROFS_FT_BLKDEV] = DT_BLK, + [EROFS_FT_FIFO] = DT_FIFO, + [EROFS_FT_SOCK] = DT_SOCK, + [EROFS_FT_SYMLINK] = DT_LNK, +}; + +static int erofs_fill_dentries(struct dir_context *ctx, + void *dentry_blk, unsigned *ofs, + unsigned nameoff, unsigned maxsize) +{ + struct erofs_dirent *de = dentry_blk; + const struct erofs_dirent *end = dentry_blk + nameoff; + + de = dentry_blk + *ofs; + while (de < end) { + const char *de_name; + int de_namelen; + unsigned char d_type; +#ifdef CONFIG_EROFS_FS_DEBUG + unsigned dbg_namelen; + unsigned char dbg_namebuf[EROFS_NAME_LEN]; +#endif + + if (unlikely(de->file_type < EROFS_FT_MAX)) + d_type = erofs_filetype_table[de->file_type]; + else + d_type = DT_UNKNOWN; + + nameoff = le16_to_cpu(de->nameoff); + de_name = (char *)dentry_blk + nameoff; + + de_namelen = unlikely(de + 1 >= end) ? + /* last directory entry */ + strnlen(de_name, maxsize - nameoff) : + le16_to_cpu(de[1].nameoff) - nameoff; + + /* the corrupted directory found */ + BUG_ON(de_namelen < 0); + +#ifdef CONFIG_EROFS_FS_DEBUG + dbg_namelen = min(EROFS_NAME_LEN - 1, de_namelen); + memcpy(dbg_namebuf, de_name, dbg_namelen); + dbg_namebuf[dbg_namelen] = '\0'; + + debugln("%s, found de_name %s de_len %d d_type %d", __func__, + dbg_namebuf, de_namelen, d_type); +#endif + + if (!dir_emit(ctx, de_name, de_namelen, + le64_to_cpu(de->nid), d_type)) + /* stoped by some reason */ + return 1; + ++de; + *ofs += sizeof(struct erofs_dirent); + } + *ofs = maxsize; + return 0; +} + +static int erofs_readdir(struct file *f, struct dir_context *ctx) +{ + struct inode *dir = file_inode(f); + struct address_space *mapping = dir->i_mapping; + const size_t dirsize = i_size_read(dir); + unsigned i = ctx->pos / EROFS_BLKSIZ; + unsigned ofs = ctx->pos % EROFS_BLKSIZ; + int err = 0; + bool initial = true; + + while (ctx->pos < dirsize) { + struct page *dentry_page; + struct erofs_dirent *de; + unsigned nameoff, maxsize; + + dentry_page = read_mapping_page(mapping, i, NULL); + if (dentry_page == ERR_PTR(-ENOMEM)) { + err = -ENOMEM; + break; + } else if (IS_ERR(dentry_page)) { + errln("fail to readdir of logical block %u of nid %llu", + i, EROFS_V(dir)->nid); + err = PTR_ERR(dentry_page); + break; + } + + lock_page(dentry_page); + de = (struct erofs_dirent *)kmap(dentry_page); + + nameoff = le16_to_cpu(de->nameoff); + + if (unlikely(nameoff < sizeof(struct erofs_dirent) || + nameoff >= PAGE_SIZE)) { + errln("%s, invalid de[0].nameoff %u", + __func__, nameoff); + + err = -EIO; + goto skip_this; + } + + maxsize = min_t(unsigned, dirsize - ctx->pos + ofs, PAGE_SIZE); + + /* search dirents at the arbitrary position */ + if (unlikely(initial)) { + initial = false; + + ofs = roundup(ofs, sizeof(struct erofs_dirent)); + if (unlikely(ofs >= nameoff)) + goto skip_this; + } + + err = erofs_fill_dentries(ctx, de, &ofs, nameoff, maxsize); +skip_this: + kunmap(dentry_page); + + unlock_page(dentry_page); + put_page(dentry_page); + + ctx->pos = blknr_to_addr(i) + ofs; + + if (unlikely(err)) + break; + ++i; + ofs = 0; + } + return err < 0 ? 
err : 0; +} + +const struct file_operations erofs_dir_fops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = erofs_readdir, +}; + diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h new file mode 100644 index 000000000000..5fcb13592ad8 --- /dev/null +++ b/fs/erofs/erofs_fs.h @@ -0,0 +1,287 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Apache-2.0 + * + * linux/drivers/staging/erofs/erofs_fs.h + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is dual-licensed; you may select either the GNU General Public + * License version 2 or Apache License, Version 2.0. See the file COPYING + * in the main directory of the Linux distribution for more details. + */ +#ifndef __EROFS_FS_H +#define __EROFS_FS_H + +/* Enhanced(Extended) ROM File System */ +#define EROFS_SUPER_MAGIC_V1 0xE0F5E1E2 +#define EROFS_SUPER_OFFSET 1024 + +struct erofs_super_block { +/* 0 */__le32 magic; /* in the little endian */ +/* 4 */__le32 checksum; /* crc32c(super_block) */ +/* 8 */__le32 features; +/* 12 */__u8 blkszbits; /* support block_size == PAGE_SIZE only */ +/* 13 */__u8 reserved; + +/* 14 */__le16 root_nid; +/* 16 */__le64 inos; /* total valid ino # (== f_files - f_favail) */ + +/* 24 */__le64 build_time; /* inode v1 time derivation */ +/* 32 */__le32 build_time_nsec; +/* 36 */__le32 blocks; /* used for statfs */ +/* 40 */__le32 meta_blkaddr; +/* 44 */__le32 xattr_blkaddr; +/* 48 */__u8 uuid[16]; /* 128-bit uuid for volume */ +/* 64 */__u8 volume_name[16]; /* volume name */ + +/* 80 */__u8 reserved2[48]; /* 128 bytes */ +} __packed; + +#define __EROFS_BIT(_prefix, _cur, _pre) enum { \ + _prefix ## _cur ## _BIT = _prefix ## _pre ## _BIT + \ + _prefix ## _pre ## _BITS } + +/* + * erofs inode data mapping: + * 0 - inode plain without inline data A: + * inode, [xattrs], ... | ... | no-holed data + * 1 - inode VLE compression B: + * inode, [xattrs], extents ... | ... + * 2 - inode plain with inline data C: + * inode, [xattrs], last_inline_data, ... | ... 
| no-holed data + * 3~7 - reserved + */ +enum { + EROFS_INODE_LAYOUT_PLAIN, + EROFS_INODE_LAYOUT_COMPRESSION, + EROFS_INODE_LAYOUT_INLINE, + EROFS_INODE_LAYOUT_MAX +}; +#define EROFS_I_VERSION_BITS 1 +#define EROFS_I_DATA_MAPPING_BITS 3 + +#define EROFS_I_VERSION_BIT 0 +__EROFS_BIT(EROFS_I_, DATA_MAPPING, VERSION); + +struct erofs_inode_v1 { +/* 0 */__le16 i_advise; + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ +/* 2 */__le16 i_xattr_icount; +/* 4 */__le16 i_mode; +/* 6 */__le16 i_nlink; +/* 8 */__le32 i_size; +/* 12 */__le32 i_reserved; +/* 16 */union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u __packed; +/* 20 */__le32 i_ino; /* only used for 32-bit stat compatibility */ +/* 24 */__le16 i_uid; +/* 26 */__le16 i_gid; +/* 28 */__le32 i_checksum; +} __packed; + +/* 32 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_V1 0 +/* 64 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_V2 1 + +struct erofs_inode_v2 { + __le16 i_advise; + + /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ + __le16 i_xattr_icount; + __le16 i_mode; + __le16 i_reserved; /* 8 bytes */ + __le64 i_size; /* 16 bytes */ + union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u __packed; + + /* only used for 32-bit stat compatibility */ + __le32 i_ino; /* 24 bytes */ + + __le32 i_uid; + __le32 i_gid; + __le64 i_ctime; /* 32 bytes */ + __le32 i_ctime_nsec; + __le32 i_nlink; + __u8 i_reserved2[12]; + __le32 i_checksum; /* 64 bytes */ +} __packed; + +#define EROFS_MAX_SHARED_XATTRS (128) +/* h_shared_count between 129 ... 255 are special # */ +#define EROFS_SHARED_XATTR_EXTENT (255) + +/* + * inline xattrs (n == i_xattr_icount): + * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes + * 12 bytes / \ + * / \ + * /-----------------------\ + * | erofs_xattr_entries+ | + * +-----------------------+ + * inline xattrs must starts in erofs_xattr_ibody_header, + * for read-only fs, no need to introduce h_refcount + */ +struct erofs_xattr_ibody_header { + __le32 h_checksum; + __u8 h_shared_count; + __u8 h_reserved[7]; + __le32 h_shared_xattrs[0]; /* shared xattr id array */ +} __packed; + +/* Name indexes */ +#define EROFS_XATTR_INDEX_USER 1 +#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EROFS_XATTR_INDEX_TRUSTED 4 +#define EROFS_XATTR_INDEX_LUSTRE 5 +#define EROFS_XATTR_INDEX_SECURITY 6 + +/* xattr entry (for both inline & shared xattrs) */ +struct erofs_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_size; /* size of attribute value */ + /* followed by e_name and e_value */ + char e_name[0]; /* attribute name */ +} __packed; + +#define ondisk_xattr_ibody_size(count) ({\ + u32 __count = le16_to_cpu(count); \ + ((__count) == 0) ? 
0 : \ + sizeof(struct erofs_xattr_ibody_header) + \ + sizeof(__u32) * ((__count) - 1); }) + +#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry)) +#define EROFS_XATTR_ENTRY_SIZE(entry) EROFS_XATTR_ALIGN( \ + sizeof(struct erofs_xattr_entry) + \ + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size)) + +/* have to be aligned with 8 bytes on disk */ +struct erofs_extent_header { + __le32 eh_checksum; + __le32 eh_reserved[3]; +} __packed; + +/* + * Z_EROFS Variable-sized Logical Extent cluster type: + * 0 - literal (uncompressed) cluster + * 1 - compressed cluster (for the head logical cluster) + * 2 - compressed cluster (for the other logical clusters) + * + * In detail, + * 0 - literal (uncompressed) cluster, + * di_advise = 0 + * di_clusterofs = the literal data offset of the cluster + * di_blkaddr = the blkaddr of the literal cluster + * + * 1 - compressed cluster (for the head logical cluster) + * di_advise = 1 + * di_clusterofs = the decompressed data offset of the cluster + * di_blkaddr = the blkaddr of the compressed cluster + * + * 2 - compressed cluster (for the other logical clusters) + * di_advise = 2 + * di_clusterofs = + * the decompressed data offset in its own head cluster + * di_u.delta[0] = distance to its corresponding head cluster + * di_u.delta[1] = distance to its corresponding tail cluster + * (di_advise could be 0, 1 or 2) +#ifdef CONFIG_EROFS_FS_HUAWEI_EXTENSION + * or di_advise[3:0] = 3 (for 4K, max 1M / 4K = 256) + * {di_advise[7:4], di_pageofs[15:12]} = + * distance to its corresponding head cluster + * di_advise[15:8] = distance to its corresponding tail cluster +#endif + */ +enum { + Z_EROFS_VLE_CLUSTER_TYPE_PLAIN, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD, + Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD, +#ifdef CONFIG_EROFS_FS_HUAWEI_EXTENSION + Z_EROFS_VLE_CLUSTER_TYPE_HUAWEI_COMPAT, +#else + Z_EROFS_VLE_CLUSTER_TYPE_RESERVED, +#endif + Z_EROFS_VLE_CLUSTER_TYPE_MAX +}; + +#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 +#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 + +struct z_erofs_vle_decompressed_index { + __le16 di_advise; + /* where to decompress in the head cluster */ + __le16 di_clusterofs; + + union { + /* for the head cluster */ + __le32 blkaddr; + /* + * for the rest clusters + * eg. 
for 4k page-sized cluster, maximum 4K*64k = 256M) + * [0] - pointing to the head cluster + * [1] - pointing to the tail cluster + */ + __le16 delta[2]; + } di_u __packed; /* 8 bytes */ +} __packed; + +#define Z_EROFS_VLE_EXTENT_ALIGN(size) round_up(size, \ + sizeof(struct z_erofs_vle_decompressed_index)) + +/* dirent sorts in alphabet order, thus we can do binary search */ +struct erofs_dirent { + __le64 nid; /* 0, node number */ + __le16 nameoff; /* 8, start offset of file name */ + __u8 file_type; /* 10, file type */ + __u8 reserved; /* 11, reserved */ +} __packed; + +/* file types used in inode_info->flags */ +enum { + EROFS_FT_UNKNOWN, + EROFS_FT_REG_FILE, + EROFS_FT_DIR, + EROFS_FT_CHRDEV, + EROFS_FT_BLKDEV, + EROFS_FT_FIFO, + EROFS_FT_SOCK, + EROFS_FT_SYMLINK, + EROFS_FT_MAX +}; + +#define EROFS_NAME_LEN 255 + +/* check the EROFS on-disk layout strictly at compile time */ +static inline void erofs_check_ondisk_layout_definitions(void) +{ + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); + BUILD_BUG_ON(sizeof(struct erofs_inode_v1) != 32); + BUILD_BUG_ON(sizeof(struct erofs_inode_v2) != 64); + BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); + BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); + BUILD_BUG_ON(sizeof(struct erofs_extent_header) != 16); + BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); + BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); + + BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < + Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); +} + +#endif + diff --git a/fs/erofs/generic/lz4.h b/fs/erofs/generic/lz4.h new file mode 100644 index 000000000000..fa9773621f09 --- /dev/null +++ b/fs/erofs/generic/lz4.h @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * linux/drivers/staging/erofs/generic/lz4.h + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * Original code taken from 'linux/lib/lz4/lz4_decompress.c' + */ + +/* + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011 - 2016, Yann Collet. + * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 + * + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> + */ +#ifndef __EROFS_GENERIC_LZ4_H +#define __EROFS_GENERIC_LZ4_H + +#include "../internal.h" +#include +#include /* memset, memcpy */ + +#define FORCE_INLINE __always_inline + +/*-************************************ + * Basic Types + **************************************/ +#include + +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +typedef uintptr_t uptrval; + +/*-************************************ + * Architecture specifics + **************************************/ +#if defined(CONFIG_64BIT) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +#if defined(__LITTLE_ENDIAN) +#define LZ4_LITTLE_ENDIAN 1 +#else +#define LZ4_LITTLE_ENDIAN 0 +#endif + +/*-************************************ + * Constants + **************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (WILDCOPYLENGTH + MINMATCH) + +/* + * ensure it's possible to write 2 x wildcopyLength + * without overflowing output buffer + */ +#define MATCH_SAFEGUARD_DISTANCE ((2 * WILDCOPYLENGTH) - MINMATCH) + +#define HASH_UNIT sizeof(size_t) + +#define KB (1 << 10) +#define MB (1 << 20) +#define GB (1U << 30) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) + +/*-************************************ + * Reading and writing into memory + **************************************/ +static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr) +{ + return get_unaligned_le16(memPtr); +} + +static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) +{ +#if LZ4_ARCH64 + U64 a = get_unaligned((const U64 *)src); + + put_unaligned(a, (U64 *)dst); +#else + U32 a = get_unaligned((const U32 *)src); + U32 b = get_unaligned((const U32 *)src + 1); + + put_unaligned(a, (U32 *)dst); + put_unaligned(b, (U32 *)dst + 1); +#endif +} + +/* + * customized variant of memcpy, + * which can overwrite up to 7 bytes beyond dstEnd + */ +static FORCE_INLINE void LZ4_wildCopy(void *dstPtr, + const void *srcPtr, void *dstEnd) +{ + BYTE *d = (BYTE *)dstPtr; + const BYTE *s = (const BYTE *)srcPtr; + BYTE *const e = (BYTE *)dstEnd; + + do { + LZ4_copy8(d, s); + d += 8; + s += 8; + } while (d < e); +} + +#define DEBUGLOG(l, ...) {} /* disabled */ +#define LZ4_STATIC_ASSERT(c) BUILD_BUG_ON(!(c)) + +/* + * no public solution to solve our requirement yet. + * see: + * https://groups.google.com/forum/#!topic/lz4c/_3kkz5N6n00 + */ +static FORCE_INLINE int __lz4_decompress_safe_partial( + uint8_t *dst_ptr, + const uint8_t *src_ptr, + BYTE *dst, + int outputSize, + const void *src, + int inputSize, + bool trusted) +{ + /* Local Variables */ + const BYTE *ip = (const BYTE *) src_ptr; + const BYTE *const iend = src + inputSize; + + BYTE *op = (BYTE *) dst_ptr; + BYTE *const oend = dst + outputSize; + BYTE *cpy; + + static const unsigned int inc32table[] = { 0, 1, 2, 1, 0, 4, 4, 4 }; + static const int dec64table[] = { 0, 0, 0, -1, -4, 1, 2, 3 }; + + /* Set up the "end" pointers for the shortcut. 
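+ * The shortcut path below blindly copies 16 literal bytes and up to
+ * 18 match bytes, so both bounds reserve that headroom: 14 is the
+ * largest literal length encodable in the token, 2 is the size of
+ * the match offset field, and 18 is the largest shortcut match copy.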
*/ + const BYTE *const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/; + const BYTE *const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/; + + DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__, + inputSize, outputSize); + + /* Empty output buffer */ + if (unlikely(!outputSize)) + return ((inputSize == 1) && (*ip == 0)) ? 0 : -EINVAL; + + if (unlikely(!inputSize)) + return -EINVAL; + + /* Main Loop : decode sequences */ + while (1) { + size_t length; + const BYTE *match; + size_t offset; + + /* get literal length */ + unsigned int const token = *ip++; + length = token >> ML_BITS; + + /* ip < iend before the increment */ + DBG_BUGON(ip > iend); + /* + * A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough + * space, enter the shortcut and copy 16 bytes on behalf + * of the literals (in the fast mode, only 8 bytes can be + * safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes + * in a similar manner; but we ensure that there's enough + * space in the output for those 18 bytes earlier, upon + * entering the shortcut (in other words, there is a + * combined check for both stages). + */ + if (length != RUN_MASK && + /* + * strictly "less than" on input, to re-enter + * the loop with at least one byte + */ + likely((ip < shortiend) & (op <= shortoend))) { + + /* Copy the literals */ + memcpy(op, ip, 16); + op += length; + ip += length; + + /* + * The second stage: + * prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. + */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + DBG_BUGON(match > op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ((length != ML_MASK) && + (offset >= 8) && (match >= dst)) { + /* Copy the match. */ + LZ4_copy8(op + 0, match + 0); + LZ4_copy8(op + 8, match + 8); + memcpy(op + 16, match + 16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* + * The second stage didn't work out, but the info + * is ready. Propel it right to the point of match + * copying. 
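+ * (length, offset and match were already decoded above, so the
+ * normal literal-copy path can be skipped entirely.)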
+ */ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + unsigned int s; + + if (unlikely(!trusted && ip >= iend - RUN_MASK)) { + /* overflow detection */ + goto _output_error; + } + + do { + s = *ip++; + length += s; + } while (likely(ip < iend - RUN_MASK) & (s == 255)); + + if (!trusted) { + if (unlikely((uptrval)(op) + length < (uptrval)op)) + /* overflow detection */ + goto _output_error; + if (unlikely((uptrval)(ip) + length < (uptrval)ip)) + /* overflow detection */ + goto _output_error; + } + } + + /* copy literals */ + cpy = op + length; + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + + if ((cpy > oend - MFLIMIT) + || (ip + length > iend - (2 + 1 + LASTLITERALS))) { + if (cpy > oend) { + /* + * Partial decoding : + * stop in the middle of literal segment + */ + cpy = oend; + length = oend - op; + } + + if (!trusted && ip + length > iend) { + /* + * Error : + * read attempt beyond + * end of input buffer + */ + goto _output_error; + } + + memcpy(op, ip, length); + ip += length; + op += length; + + /* Necessarily EOF, due to parsing restrictions */ + if (cpy == oend) + break; + } else { + /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + LZ4_wildCopy(op, ip, cpy); + ip += length; + op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + +_copy_match: + if (length == ML_MASK) { + unsigned int s; + + do { + s = *ip++; + + if (!trusted && ip > iend - LASTLITERALS) + goto _output_error; + + length += s; + } while (s == 255); + + if (unlikely(!trusted && + (uptrval)(op) + length < (uptrval)op)) { + /* overflow detection */ + goto _output_error; + } + } + + length += MINMATCH; + + /* copy match within block */ + cpy = op + length; + + /* + * partialDecoding : + * may not respect endBlock parsing restrictions + */ + DBG_BUGON(op > oend); + if (cpy > oend - MATCH_SAFEGUARD_DISTANCE) { + size_t const mlen = min(length, (size_t)(oend - op)); + const BYTE * const matchEnd = match + mlen; + BYTE * const copyEnd = op + mlen; + + if (matchEnd > op) { + /* overlap copy */ + while (op < copyEnd) + *op++ = *match++; + } else { + memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) + break; + continue; + } + + if (unlikely(offset < 8)) { + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + memcpy(op + 4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_copy8(op, match); + match += 8; + } + + op += 8; + if (unlikely(cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1); + + if (!trusted && cpy > oend - LASTLITERALS) { + /* + * Error : last LASTLITERALS bytes + * must be literals (uncompressed) + */ + goto _output_error; + } + + if (op < oCopyLimit) { + LZ4_wildCopy(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) + *op++ = *match++; + } else { + LZ4_copy8(op, match); + if (length > 16) + LZ4_wildCopy(op + 8, match + 8, cpy); + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + /* Nb of output bytes decoded */ + return (int) (((BYTE *)op) - dst); + + /* Overflow error detected */ +_output_error: + return -ERANGE; +} + +#endif + diff --git a/fs/erofs/include/linux/tagptr.h b/fs/erofs/include/linux/tagptr.h new file mode 100644 index 000000000000..6f9c2f45ed71 --- /dev/null +++ b/fs/erofs/include/linux/tagptr.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Tagged 
pointer implementation + * + * Copyright (C) 2018 Gao Xiang + */ +#ifndef _LINUX_TAGPTR_H +#define _LINUX_TAGPTR_H + +#include +#include + +/* + * the name of tagged pointer types are tagptr{1, 2, 3...}_t + * avoid directly using the internal structs __tagptr{1, 2, 3...} + */ +#define __MAKE_TAGPTR(n) \ +typedef struct __tagptr##n { \ + uintptr_t v; \ +} tagptr##n##_t; + +__MAKE_TAGPTR(1) +__MAKE_TAGPTR(2) +__MAKE_TAGPTR(3) +__MAKE_TAGPTR(4) + +#undef __MAKE_TAGPTR + +extern void __compiletime_error("bad tagptr tags") + __bad_tagptr_tags(void); + +extern void __compiletime_error("bad tagptr type") + __bad_tagptr_type(void); + +/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ +#define __tagptr_mask_1(ptr, n) \ + __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \ + (1UL << (n)) - 1 : + +#define __tagptr_mask(ptr) (\ + __tagptr_mask_1(ptr, 1) ( \ + __tagptr_mask_1(ptr, 2) ( \ + __tagptr_mask_1(ptr, 3) ( \ + __tagptr_mask_1(ptr, 4) ( \ + __bad_tagptr_type(), 0))))) + +/* generate a tagged pointer from a raw value */ +#define tagptr_init(type, val) \ + ((typeof(type)){ .v = (uintptr_t)(val) }) + +/* + * directly cast a tagged pointer to the native pointer type, which + * could be used for backward compatibility of existing code. + */ +#define tagptr_cast_ptr(tptr) ((void *)(tptr).v) + +/* encode tagged pointers */ +#define tagptr_fold(type, ptr, _tags) ({ \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ + __bad_tagptr_tags(); \ +tagptr_init(type, (uintptr_t)(ptr) | tags); }) + +/* decode tagged pointers */ +#define tagptr_unfold_ptr(tptr) \ + ((void *)((tptr).v & ~__tagptr_mask(tptr))) + +#define tagptr_unfold_tags(tptr) \ + ((tptr).v & __tagptr_mask(tptr)) + +/* operations for the tagger pointer */ +#define tagptr_eq(_tptr1, _tptr2) ({ \ + typeof(_tptr1) tptr1 = (_tptr1); \ + typeof(_tptr2) tptr2 = (_tptr2); \ + (void)(&tptr1 == &tptr2); \ +(tptr1).v == (tptr2).v; }) + +/* lock-free CAS operation */ +#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + typeof(_o) o = (_o); \ + typeof(_n) n = (_n); \ + (void)(&o == &n); \ + (void)(&o == ptptr); \ +tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) + +/* wrap WRITE_ONCE if atomic update is needed */ +#define tagptr_replace_tags(_ptptr, tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ +*ptptr; }) + +#define tagptr_set_tags(_ptptr, _tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ + __bad_tagptr_tags(); \ + ptptr->v |= tags; \ +*ptptr; }) + +#define tagptr_clear_tags(_ptptr, _tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ + __bad_tagptr_tags(); \ + ptptr->v &= ~tags; \ +*ptptr; }) + +#endif + diff --git a/fs/erofs/include/trace/events/erofs.h b/fs/erofs/include/trace/events/erofs.h new file mode 100644 index 000000000000..0a640323961c --- /dev/null +++ b/fs/erofs/include/trace/events/erofs.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM erofs + +#if !defined(_TRACE_EROFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EROFS_H + +#include + +#define show_dev(dev) MAJOR(dev), MINOR(dev) +#define show_dev_nid(entry) show_dev(entry->dev), entry->nid + +#define show_file_type(type) \ + 
__print_symbolic(type, \ + { 0, "FILE" }, \ + { 1, "DIR" }) + +#define show_map_flags(flags) __print_flags(flags, "|", \ + { EROFS_GET_BLOCKS_RAW, "RAW" }) + +#define show_mflags(flags) __print_flags(flags, "", \ + { EROFS_MAP_MAPPED, "M" }, \ + { EROFS_MAP_META, "I" }, \ + { EROFS_MAP_ZIPPED, "Z" }) + +TRACE_EVENT(erofs_lookup, + + TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(erofs_nid_t, nid ) + __field(const char *, name ) + __field(unsigned int, flags ) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->nid = EROFS_V(dir)->nid; + __entry->name = dentry->d_name.name; + __entry->flags = flags; + ), + + TP_printk("dev = (%d,%d), pnid = %llu, name:%s, flags:%x", + show_dev_nid(__entry), + __entry->name, + __entry->flags) +); + +TRACE_EVENT(erofs_fill_inode, + TP_PROTO(struct inode *inode, int isdir), + TP_ARGS(inode, isdir), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(erofs_nid_t, nid ) + __field(erofs_blk_t, blkaddr ) + __field(unsigned int, ofs ) + __field(int, isdir ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_V(inode)->nid; + __entry->blkaddr = erofs_blknr(iloc(EROFS_I_SB(inode), __entry->nid)); + __entry->ofs = erofs_blkoff(iloc(EROFS_I_SB(inode), __entry->nid)); + __entry->isdir = isdir; + ), + + TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u, isdir %d", + show_dev_nid(__entry), + __entry->blkaddr, __entry->ofs, + __entry->isdir) +); + +TRACE_EVENT(erofs_readpage, + + TP_PROTO(struct page *page, bool raw), + + TP_ARGS(page, raw), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(erofs_nid_t, nid ) + __field(int, dir ) + __field(pgoff_t, index ) + __field(int, uptodate) + __field(bool, raw ) + ), + + TP_fast_assign( + __entry->dev = page->mapping->host->i_sb->s_dev; + __entry->nid = EROFS_V(page->mapping->host)->nid; + __entry->dir = S_ISDIR(page->mapping->host->i_mode); + __entry->index = page->index; + __entry->uptodate = PageUptodate(page); + __entry->raw = raw; + ), + + TP_printk("dev = (%d,%d), nid = %llu, %s, index = %lu, uptodate = %d " + "raw = %d", + show_dev_nid(__entry), + show_file_type(__entry->dir), + (unsigned long)__entry->index, + __entry->uptodate, + __entry->raw) +); + +TRACE_EVENT(erofs_readpages, + + TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage, + bool raw), + + TP_ARGS(inode, page, nrpage, raw), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(erofs_nid_t, nid ) + __field(pgoff_t, start ) + __field(unsigned int, nrpage ) + __field(bool, raw ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_V(inode)->nid; + __entry->start = page->index; + __entry->nrpage = nrpage; + __entry->raw = raw; + ), + + TP_printk("dev = (%d,%d), nid = %llu, start = %lu nrpage = %u raw = %d", + show_dev_nid(__entry), + (unsigned long)__entry->start, + __entry->nrpage, + __entry->raw) +); + +DECLARE_EVENT_CLASS(erofs__map_blocks_enter, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned int flags), + + TP_ARGS(inode, map, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( erofs_nid_t, nid ) + __field( erofs_off_t, la ) + __field( u64, llen ) + __field( unsigned int, flags ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_V(inode)->nid; + __entry->la = map->m_la; + __entry->llen = map->m_llen; + __entry->flags = flags; + ), + + TP_printk("dev = (%d,%d), nid 
= %llu, la %llu llen %llu flags %s", + show_dev_nid(__entry), + __entry->la, __entry->llen, + __entry->flags ? show_map_flags(__entry->flags) : "NULL") +); + +DEFINE_EVENT(erofs__map_blocks_enter, erofs_map_blocks_flatmode_enter, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned flags), + + TP_ARGS(inode, map, flags) +); + +DEFINE_EVENT(erofs__map_blocks_enter, z_erofs_map_blocks_iter_enter, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned flags), + + TP_ARGS(inode, map, flags) +); + +DECLARE_EVENT_CLASS(erofs__map_blocks_exit, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned int flags, int ret), + + TP_ARGS(inode, map, flags, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( erofs_nid_t, nid ) + __field( unsigned int, flags ) + __field( erofs_off_t, la ) + __field( erofs_off_t, pa ) + __field( u64, llen ) + __field( u64, plen ) + __field( unsigned int, mflags ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_V(inode)->nid; + __entry->flags = flags; + __entry->la = map->m_la; + __entry->pa = map->m_pa; + __entry->llen = map->m_llen; + __entry->plen = map->m_plen; + __entry->mflags = map->m_flags; + __entry->ret = ret; + ), + + TP_printk("dev = (%d,%d), nid = %llu, flags %s " + "la %llu pa %llu llen %llu plen %llu mflags %s ret %d", + show_dev_nid(__entry), + __entry->flags ? show_map_flags(__entry->flags) : "NULL", + __entry->la, __entry->pa, __entry->llen, __entry->plen, + show_mflags(__entry->mflags), __entry->ret) +); + +DEFINE_EVENT(erofs__map_blocks_exit, erofs_map_blocks_flatmode_exit, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned flags, int ret), + + TP_ARGS(inode, map, flags, ret) +); + +DEFINE_EVENT(erofs__map_blocks_exit, z_erofs_map_blocks_iter_exit, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned flags, int ret), + + TP_ARGS(inode, map, flags, ret) +); + +TRACE_EVENT(erofs_destroy_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( erofs_nid_t, nid ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_V(inode)->nid; + ), + + TP_printk("dev = (%d,%d), nid = %llu", show_dev_nid(__entry)) +); + +#endif /* _TRACE_EROFS_H */ + + /* This part must be outside protection */ +#include diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c new file mode 100644 index 000000000000..d00273fb4887 --- /dev/null +++ b/fs/erofs/inode.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/inode.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ +#include "xattr.h" + +#include + +/* no locking */ +static int read_inode(struct inode *inode, void *data) +{ + struct erofs_vnode *vi = EROFS_V(inode); + struct erofs_inode_v1 *v1 = data; + const unsigned advise = le16_to_cpu(v1->i_advise); + + vi->data_mapping_mode = __inode_data_mapping(advise); + + if (unlikely(vi->data_mapping_mode >= EROFS_INODE_LAYOUT_MAX)) { + errln("unknown data mapping mode %u of nid %llu", + vi->data_mapping_mode, vi->nid); + DBG_BUGON(1); + return -EIO; + } + + if (__inode_version(advise) == EROFS_INODE_LAYOUT_V2) { + struct erofs_inode_v2 *v2 = data; + + vi->inode_isize = sizeof(struct erofs_inode_v2); + vi->xattr_isize = ondisk_xattr_ibody_size(v2->i_xattr_icount); + + inode->i_mode = le16_to_cpu(v2->i_mode); + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + vi->raw_blkaddr = le32_to_cpu(v2->i_u.raw_blkaddr); + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + inode->i_rdev = + new_decode_dev(le32_to_cpu(v2->i_u.rdev)); + } else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_rdev = 0; + } else { + return -EIO; + } + + i_uid_write(inode, le32_to_cpu(v2->i_uid)); + i_gid_write(inode, le32_to_cpu(v2->i_gid)); + set_nlink(inode, le32_to_cpu(v2->i_nlink)); + + /* ns timestamp */ + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = + le64_to_cpu(v2->i_ctime); + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = + le32_to_cpu(v2->i_ctime_nsec); + + inode->i_size = le64_to_cpu(v2->i_size); + } else if (__inode_version(advise) == EROFS_INODE_LAYOUT_V1) { + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + + vi->inode_isize = sizeof(struct erofs_inode_v1); + vi->xattr_isize = ondisk_xattr_ibody_size(v1->i_xattr_icount); + + inode->i_mode = le16_to_cpu(v1->i_mode); + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + vi->raw_blkaddr = le32_to_cpu(v1->i_u.raw_blkaddr); + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + inode->i_rdev = + new_decode_dev(le32_to_cpu(v1->i_u.rdev)); + } else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_rdev = 0; + } else { + return -EIO; + } + + i_uid_write(inode, le16_to_cpu(v1->i_uid)); + i_gid_write(inode, le16_to_cpu(v1->i_gid)); + set_nlink(inode, le16_to_cpu(v1->i_nlink)); + + /* use build time to derive all file time */ + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = + sbi->build_time; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = + sbi->build_time_nsec; + + inode->i_size = le32_to_cpu(v1->i_size); + } else { + errln("unsupported on-disk inode version %u of nid %llu", + __inode_version(advise), vi->nid); + DBG_BUGON(1); + return -EIO; + } + + /* measure inode.i_blocks as the generic filesystem */ + inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; + return 0; +} + +/* + * try_lock can be required since locking order is: + * file data(fs_inode) + * meta(bd_inode) + * but the majority of the callers is "iget", + * in that case we are pretty sure no deadlock since + * no data operations exist. However I tend to + * try_lock since it takes no much overhead and + * will success immediately. 
+ */ +static int fill_inline_data(struct inode *inode, void *data, unsigned m_pofs) +{ + struct erofs_vnode *vi = EROFS_V(inode); + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + int mode = vi->data_mapping_mode; + + DBG_BUGON(mode >= EROFS_INODE_LAYOUT_MAX); + + /* should be inode inline C */ + if (mode != EROFS_INODE_LAYOUT_INLINE) + return 0; + + /* fast symlink (following ext4) */ + if (S_ISLNK(inode->i_mode) && inode->i_size < PAGE_SIZE) { + char *lnk = erofs_kmalloc(sbi, inode->i_size + 1, GFP_KERNEL); + + if (unlikely(lnk == NULL)) + return -ENOMEM; + + m_pofs += vi->inode_isize + vi->xattr_isize; + BUG_ON(m_pofs + inode->i_size > PAGE_SIZE); + + /* get in-page inline data */ + memcpy(lnk, data + m_pofs, inode->i_size); + lnk[inode->i_size] = '\0'; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0)) + vi->i_link = lnk; +#else + inode->i_link = lnk; +#endif + set_inode_fast_symlink(inode); + } + return -EAGAIN; +} + +static int fill_inode(struct inode *inode, int isdir) +{ + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + struct erofs_vnode *vi = EROFS_V(inode); + struct page *page; + void *data; + int err; + erofs_blk_t blkaddr; + unsigned ofs; + + trace_erofs_fill_inode(inode, isdir); + + blkaddr = erofs_blknr(iloc(sbi, vi->nid)); + ofs = erofs_blkoff(iloc(sbi, vi->nid)); + + debugln("%s, reading inode nid %llu at %u of blkaddr %u", + __func__, vi->nid, ofs, blkaddr); + + page = erofs_get_meta_page(inode->i_sb, blkaddr, isdir); + + if (IS_ERR(page)) { + errln("failed to get inode (nid: %llu) page, err %ld", + vi->nid, PTR_ERR(page)); + return PTR_ERR(page); + } + + BUG_ON(!PageUptodate(page)); + data = page_address(page); + + err = read_inode(inode, data + ofs); + if (!err) { + /* setup the new inode */ + if (S_ISREG(inode->i_mode)) { +#ifdef CONFIG_EROFS_FS_XATTR + if (vi->xattr_isize) + inode->i_op = &erofs_generic_xattr_iops; +#endif + inode->i_fop = &generic_ro_fops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = +#ifdef CONFIG_EROFS_FS_XATTR + vi->xattr_isize ? 
&erofs_dir_xattr_iops : +#endif + &erofs_dir_iops; + inode->i_fop = &erofs_dir_fops; + } else if (S_ISLNK(inode->i_mode)) { + /* by default, page_get_link is used for symlink */ + inode->i_op = +#ifdef CONFIG_EROFS_FS_XATTR + &erofs_symlink_xattr_iops, +#else + &page_symlink_inode_operations; +#endif + inode_nohighmem(inode); + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { +#ifdef CONFIG_EROFS_FS_XATTR + inode->i_op = &erofs_special_inode_operations; +#endif + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } else { + err = -EIO; + goto out_unlock; + } + + if (is_inode_layout_compression(inode)) { +#ifdef CONFIG_EROFS_FS_ZIP + inode->i_mapping->a_ops = + &z_erofs_vle_normalaccess_aops; +#else + err = -ENOTSUPP; +#endif + goto out_unlock; + } + + inode->i_mapping->a_ops = &erofs_raw_access_aops; + + /* fill last page if inline data is available */ + fill_inline_data(inode, data, ofs); + } + +out_unlock: + unlock_page(page); + put_page(page); + return err; +} + +struct inode *erofs_iget(struct super_block *sb, + erofs_nid_t nid, bool isdir) +{ + struct inode *inode = iget_locked(sb, nid); + + if (unlikely(inode == NULL)) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + int err; + struct erofs_vnode *vi = EROFS_V(inode); + vi->nid = nid; + + err = fill_inode(inode, isdir); + if (likely(!err)) + unlock_new_inode(inode); + else { + iget_failed(inode); + inode = ERR_PTR(err); + } + } + return inode; +} + +#ifdef CONFIG_EROFS_FS_XATTR +const struct inode_operations erofs_generic_xattr_iops = { +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + .getxattr = generic_getxattr, +#endif + .listxattr = erofs_listxattr, +}; +#endif + +#ifdef CONFIG_EROFS_FS_XATTR +const struct inode_operations erofs_symlink_xattr_iops = { +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)) + .readlink = generic_readlink, +#endif +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)) + .follow_link = page_follow_link_light, + .put_link = page_put_link, +#else + .get_link = page_get_link, +#endif +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + .getxattr = generic_getxattr, +#endif + .listxattr = erofs_listxattr, +}; + +const struct inode_operations erofs_special_inode_operations = { + .listxattr = erofs_listxattr, +}; + +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0)) +#include + +static void *erofs_follow_fast_link(struct dentry *dentry, struct nameidata *nd) +{ + struct erofs_vnode *vi = EROFS_V(d_inode(dentry)); + + nd_set_link(nd, (char *)vi->i_link); + return NULL; +} + +const struct inode_operations simple_symlink_inode_operations = { + .follow_link = erofs_follow_fast_link, + .readlink = generic_readlink +}; +#endif + +#ifdef CONFIG_EROFS_FS_XATTR +const struct inode_operations erofs_fast_symlink_xattr_iops = { +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)) + .readlink = generic_readlink, +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0)) + .follow_link = erofs_follow_fast_link, +#else +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)) + .follow_link = simple_follow_link, +#else + .get_link = simple_get_link, +#endif +#endif +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + .getxattr = generic_getxattr, +#endif + .listxattr = erofs_listxattr, +}; +#endif + diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h new file mode 100644 index 000000000000..d1c96138e08a --- /dev/null +++ b/fs/erofs/internal.h @@ -0,0 +1,646 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * 
linux/drivers/staging/erofs/internal.h + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#ifndef __INTERNAL_H +#define __INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "erofs_fs.h" + +#include "staging.h" + +/* redefine pr_fmt "erofs: " */ +#undef pr_fmt +#define pr_fmt(fmt) "erofs: " fmt + +#define errln(x, ...) pr_err(x "\n", ##__VA_ARGS__) +#define infoln(x, ...) pr_info(x "\n", ##__VA_ARGS__) +#ifdef CONFIG_EROFS_FS_DEBUG +#define debugln(x, ...) pr_debug(x "\n", ##__VA_ARGS__) + +#define dbg_might_sleep might_sleep +#define DBG_BUGON BUG_ON +#else +#define debugln(x, ...) ((void)0) + +#define dbg_might_sleep() ((void)0) +#define DBG_BUGON(...) ((void)0) +#endif + +#ifdef CONFIG_EROFS_FAULT_INJECTION +enum { + FAULT_KMALLOC, + FAULT_MAX, +}; + +extern char *erofs_fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) + +struct erofs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; +#endif + +#ifdef CONFIG_EROFS_FS_ZIP_CACHE_BIPOLAR +#define EROFS_FS_ZIP_CACHE_LVL (2) +#elif defined(EROFS_FS_ZIP_CACHE_UNIPOLAR) +#define EROFS_FS_ZIP_CACHE_LVL (1) +#else +#define EROFS_FS_ZIP_CACHE_LVL (0) +#endif + +#if (!defined(EROFS_FS_HAS_MANAGED_CACHE) && (EROFS_FS_ZIP_CACHE_LVL > 0)) +#define EROFS_FS_HAS_MANAGED_CACHE +#endif + +/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */ +#define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1 + +typedef u64 erofs_nid_t; + +struct erofs_sb_info { + /* list for all registered superblocks, mainly for shrinker */ + struct list_head list; + struct mutex umount_mutex; + + u32 blocks; + u32 meta_blkaddr; +#ifdef CONFIG_EROFS_FS_XATTR + u32 xattr_blkaddr; +#endif + + /* inode slot unit size in bit shift */ + unsigned char islotbits; +#ifdef CONFIG_EROFS_FS_ZIP + /* cluster size in bit shift */ + unsigned char clusterbits; + + /* the dedicated workstation for compression */ + struct { + struct radix_tree_root tree; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)) + spinlock_t lock; +#endif + } workstn; + + /* threshold for decompression synchronously */ + unsigned int max_sync_decompress_pages; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + struct inode *managed_cache; +#endif + +#endif + + u32 build_time_nsec; + u64 build_time; + + /* what we really care is nid, rather than ino.. 
*/ + erofs_nid_t root_nid; + /* used for statfs, f_files - f_favail */ + u64 inos; + + u8 uuid[16]; /* 128-bit uuid for volume */ + u8 volume_name[16]; /* volume name */ + char *dev_name; + + unsigned int mount_opt; + unsigned int shrinker_run_no; + +#ifdef CONFIG_EROFS_FAULT_INJECTION + struct erofs_fault_info fault_info; /* For fault injection */ +#endif +}; + +#ifdef CONFIG_EROFS_FAULT_INJECTION +#define erofs_show_injection_info(type) \ + infoln("inject %s in %s of %pS", erofs_fault_name[type], \ + __func__, __builtin_return_address(0)) + +static inline bool time_to_inject(struct erofs_sb_info *sbi, int type) +{ + struct erofs_fault_info *ffi = &sbi->fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + return true; + } + return false; +} +#endif + +static inline void *erofs_kmalloc(struct erofs_sb_info *sbi, + size_t size, gfp_t flags) +{ +#ifdef CONFIG_EROFS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_KMALLOC)) { + erofs_show_injection_info(FAULT_KMALLOC); + return NULL; + } +#endif + return kmalloc(size, flags); +} + +#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) +#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info) + +/* Mount flags set via mount options or defaults */ +#define EROFS_MOUNT_XATTR_USER 0x00000010 +#define EROFS_MOUNT_POSIX_ACL 0x00000020 +#define EROFS_MOUNT_FAULT_INJECTION 0x00000040 +#define EROFS_MOUNT_LZ4ASM 0x01000000 + +#define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option) + +#ifdef CONFIG_EROFS_FS_ZIP +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)) +#define erofs_workstn_lock(sbi) spin_lock(&(sbi)->workstn.lock) +#define erofs_workstn_unlock(sbi) spin_unlock(&(sbi)->workstn.lock) +#else +#define erofs_workstn_lock(sbi) xa_lock(&(sbi)->workstn.tree) +#define erofs_workstn_unlock(sbi) xa_unlock(&(sbi)->workstn.tree) +#endif + +/* basic unit of the workstation of a super_block */ +struct erofs_workgroup { + /* the workgroup index in the workstation */ + pgoff_t index; + + /* overall workgroup reference count */ + atomic_t refcount; +}; + +#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) + +static inline bool erofs_workgroup_try_to_freeze( + struct erofs_workgroup *grp, int v) +{ +#if defined(CONFIG_SMP) + if (v != atomic_cmpxchg(&grp->refcount, + v, EROFS_LOCKED_MAGIC)) + return false; + preempt_disable(); +#else + preempt_disable(); + if (atomic_read(&grp->refcount) != v) { + preempt_enable(); + return false; + } +#endif + return true; +} + +static inline void erofs_workgroup_unfreeze( + struct erofs_workgroup *grp, int v) +{ +#if defined(CONFIG_SMP) + smp_mb(); /* atomic_set doesn't imply a barrier */ + atomic_set(&grp->refcount, v); +#endif + preempt_enable(); +} + +static inline bool erofs_workgroup_get(struct erofs_workgroup *grp, int *ocnt) +{ + const int locked = (int)EROFS_LOCKED_MAGIC; + int o; + +repeat: + o = atomic_read(&grp->refcount); + + /* spin if it is temporarily locked at the reclaim path */ + if (unlikely(o == locked)) { +#if defined(CONFIG_SMP) + do + cpu_relax(); + while (atomic_read(&grp->refcount) == locked); +#endif + goto repeat; + } + + if (unlikely(o <= 0)) + return -1; + + if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o)) + goto 
repeat; + + *ocnt = o; + return 0; +} + +#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount) + +extern int erofs_workgroup_put(struct erofs_workgroup *grp); + +extern struct erofs_workgroup *erofs_find_workgroup( + struct super_block *sb, pgoff_t index, bool *tag); + +extern int erofs_register_workgroup(struct super_block *sb, + struct erofs_workgroup *grp, bool tag); + +extern unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, + unsigned long nr_shrink, bool cleanup); + +static inline void erofs_workstation_cleanup_all(struct super_block *sb) +{ + erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true); +} + +#ifdef EROFS_FS_HAS_MANAGED_CACHE +#define EROFS_UNALLOCATED_CACHED_PAGE ((void *)0x5F0EF00D) + +static inline struct address_space *MNGD_MAPPING(struct erofs_sb_info *sbi) +{ + return sbi->managed_cache->i_mapping; +} + +extern int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp); +extern int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page); +#ifdef CONFIG_MIGRATION +int erofs_migrate_cached_page(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode); +#endif +#endif + +#define DEFAULT_MAX_SYNC_DECOMPRESS_PAGES 3 + +static inline bool __should_decompress_synchronously(struct erofs_sb_info *sbi, + unsigned int nr) +{ + return nr <= sbi->max_sync_decompress_pages; +} + +#endif + +/* we strictly follow PAGE_SIZE and no buffer head yet */ +#define LOG_BLOCK_SIZE PAGE_SHIFT + +#undef LOG_SECTORS_PER_BLOCK +#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) + +#undef SECTORS_PER_BLOCK +#define SECTORS_PER_BLOCK (1 << SECTORS_PER_BLOCK) + +#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE) + +#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ) +#error erofs cannot be used in this platform +#endif + +#define ROOT_NID(sb) ((sb)->root_nid) + +#ifdef CONFIG_EROFS_FS_ZIP +/* hard limit of pages per compressed cluster */ +#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) + +/* page count of a compressed cluster */ +#define erofs_clusterpages(sbi) ((1 << (sbi)->clusterbits) / PAGE_SIZE) +#endif + +typedef u64 erofs_off_t; + +/* data type for filesystem-wide blocks number */ +typedef u32 erofs_blk_t; + +#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) +#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) +#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) + +static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) +{ + return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); +} + +#define inode_set_inited_xattr(inode) (EROFS_V(inode)->flags |= 1) +#define inode_has_inited_xattr(inode) (EROFS_V(inode)->flags & 1) + +struct erofs_vnode { + erofs_nid_t nid; + unsigned int flags; + + unsigned char data_mapping_mode; + /* inline size in bytes */ + unsigned char inode_isize; + unsigned short xattr_isize; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0)) + char *i_link; +#endif + unsigned xattr_shared_count; + unsigned *xattr_shared_xattrs; + + erofs_blk_t raw_blkaddr; + + /* the corresponding vfs inode */ + struct inode vfs_inode; +}; + +#define EROFS_V(ptr) \ + container_of(ptr, struct erofs_vnode, vfs_inode) + +#define __inode_advise(x, bit, bits) \ + (((x) >> (bit)) & ((1 << (bits)) - 1)) + +#define __inode_version(advise) \ + __inode_advise(advise, EROFS_I_VERSION_BIT, \ + EROFS_I_VERSION_BITS) + +#define __inode_data_mapping(advise) \ + __inode_advise(advise, EROFS_I_DATA_MAPPING_BIT,\ + EROFS_I_DATA_MAPPING_BITS) + 
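/*
 * Editor's illustration (not part of this patch): the iloc()/__inode_advise()
 * helpers above reduce to plain shift-and-mask arithmetic. The standalone,
 * userspace-style sketch below assumes a 4KiB block, a 32-byte inode slot
 * (so islotbits == 5) and a v1-style advise-field layout; the EX_* constant
 * names are placeholders for this example, not the on-disk definitions from
 * erofs_fs.h.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_BLKSIZ		4096u	/* EROFS_BLKSIZ on a 4KiB-page kernel */
#define EX_ISLOTBITS		5u	/* ffs(sizeof(struct erofs_inode_v1)) - 1 */
#define EX_DATA_MAPPING_BIT	1u	/* assumed bit position of the mapping mode */
#define EX_DATA_MAPPING_BITS	3u	/* assumed width of the mapping mode field */

/* byte offset of inode @nid: blknr_to_addr(meta_blkaddr) + (nid << islotbits) */
static uint64_t ex_iloc(uint32_t meta_blkaddr, uint64_t nid)
{
	return (uint64_t)meta_blkaddr * EX_BLKSIZ + (nid << EX_ISLOTBITS);
}

/* __inode_advise(): extract a bitfield from the 16-bit i_advise word */
static unsigned int ex_data_mapping(uint16_t advise)
{
	return (advise >> EX_DATA_MAPPING_BIT) &
	       ((1u << EX_DATA_MAPPING_BITS) - 1);
}

int main(void)
{
	uint64_t off = ex_iloc(2, 36);	/* meta block 2, nid 36 */

	printf("inode nid 36: offset %llu (blk %llu, blkoff %llu), mapping mode %u\n",
	       (unsigned long long)off,
	       (unsigned long long)(off / EX_BLKSIZ),	/* erofs_blknr() */
	       (unsigned long long)(off % EX_BLKSIZ),	/* erofs_blkoff() */
	       ex_data_mapping(0x2));	/* hypothetical advise value */
	return 0;
}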
+static inline unsigned long inode_datablocks(struct inode *inode) +{ + /* since i_size cannot be changed */ + return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); +} + +static inline bool is_inode_layout_plain(struct inode *inode) +{ + return EROFS_V(inode)->data_mapping_mode == EROFS_INODE_LAYOUT_PLAIN; +} + +static inline bool is_inode_layout_compression(struct inode *inode) +{ + return EROFS_V(inode)->data_mapping_mode == + EROFS_INODE_LAYOUT_COMPRESSION; +} + +static inline bool is_inode_layout_inline(struct inode *inode) +{ + return EROFS_V(inode)->data_mapping_mode == EROFS_INODE_LAYOUT_INLINE; +} + +extern const struct super_operations erofs_sops; +extern const struct inode_operations erofs_dir_iops; +extern const struct file_operations erofs_dir_fops; + +extern const struct address_space_operations erofs_raw_access_aops; +#ifdef CONFIG_EROFS_FS_ZIP +extern const struct address_space_operations z_erofs_vle_normalaccess_aops; +#endif + +/* + * Logical to physical block mapping, used by erofs_map_blocks() + * + * Different with other file systems, it is used for 2 access modes: + * + * 1) RAW access mode: + * + * Users pass a valid (m_lblk, m_lofs -- usually 0) pair, + * and get the valid m_pblk, m_pofs and the longest m_len(in bytes). + * + * Note that m_lblk in the RAW access mode refers to the number of + * the compressed ondisk block rather than the uncompressed + * in-memory block for the compressed file. + * + * m_pofs equals to m_lofs except for the inline data page. + * + * 2) Normal access mode: + * + * If the inode is not compressed, it has no difference with + * the RAW access mode. However, if the inode is compressed, + * users should pass a valid (m_lblk, m_lofs) pair, and get + * the needed m_pblk, m_pofs, m_len to get the compressed data + * and the updated m_lblk, m_lofs which indicates the start + * of the corresponding uncompressed data in the file. + */ +enum { + BH_Zipped = BH_PrivateStart, +}; + +/* Has a disk mapping */ +#define EROFS_MAP_MAPPED (1 << BH_Mapped) +/* Located in metadata (could be copied from bd_inode) */ +#define EROFS_MAP_META (1 << BH_Meta) +/* The extent has been compressed */ +#define EROFS_MAP_ZIPPED (1 << BH_Zipped) + +struct erofs_map_blocks { + erofs_off_t m_pa, m_la; + u64 m_plen, m_llen; + + unsigned int m_flags; +}; + +/* Flags used by erofs_map_blocks() */ +#define EROFS_GET_BLOCKS_RAW 0x0001 + +/* data.c */ +static inline struct bio * +erofs_grab_bio(struct super_block *sb, + erofs_blk_t blkaddr, unsigned int nr_pages, + bio_end_io_t endio, bool nofail) +{ + const gfp_t gfp = GFP_NOIO; + struct bio *bio; + + do { + if (nr_pages == 1) { + bio = bio_alloc(gfp | (nofail ? 
__GFP_NOFAIL : 0), 1); + if (unlikely(bio == NULL)) { + DBG_BUGON(nofail); + return ERR_PTR(-ENOMEM); + } + break; + } + bio = bio_alloc(gfp, nr_pages); + nr_pages /= 2; + } while (unlikely(bio == NULL)); + + bio->bi_end_io = endio; + bio_set_dev(bio, sb->s_bdev); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0)) + bio->bi_sector = (sector_t)blkaddr << LOG_SECTORS_PER_BLOCK; +#else + bio->bi_iter.bi_sector = (sector_t)blkaddr << LOG_SECTORS_PER_BLOCK; +#endif + return bio; +} + +static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags) +{ + bio_set_op_attrs(bio, op, op_flags); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0)) + submit_bio(0, bio); +#else + submit_bio(bio); +#endif +} + +#ifndef CONFIG_EROFS_FS_IO_MAX_RETRIES +#define EROFS_IO_MAX_RETRIES_NOFAIL 0 +#else +#define EROFS_IO_MAX_RETRIES_NOFAIL CONFIG_EROFS_FS_IO_MAX_RETRIES +#endif + +extern struct page *__erofs_get_meta_page(struct super_block *sb, + erofs_blk_t blkaddr, bool prio, bool nofail); + +static inline struct page *erofs_get_meta_page(struct super_block *sb, + erofs_blk_t blkaddr, bool prio) +{ + return __erofs_get_meta_page(sb, blkaddr, prio, false); +} + +static inline struct page *erofs_get_meta_page_nofail(struct super_block *sb, + erofs_blk_t blkaddr, bool prio) +{ + return __erofs_get_meta_page(sb, blkaddr, prio, true); +} + +extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); +extern int erofs_map_blocks_iter(struct inode *, struct erofs_map_blocks *, + struct page **, int); + +struct erofs_map_blocks_iter { + struct erofs_map_blocks map; + struct page *mpage; +}; + + +static inline struct page * +erofs_get_inline_page(struct inode *inode, + erofs_blk_t blkaddr) +{ + return erofs_get_meta_page(inode->i_sb, + blkaddr, S_ISDIR(inode->i_mode)); +} + +/* inode.c */ +extern struct inode *erofs_iget(struct super_block *sb, + erofs_nid_t nid, bool dir); + +/* dir.c */ +int erofs_namei(struct inode *dir, struct qstr *name, + erofs_nid_t *nid, unsigned *d_type); + +/* xattr.c */ +#ifdef CONFIG_EROFS_FS_XATTR +extern const struct xattr_handler *erofs_xattr_handlers[]; +#endif + +/* symlink */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0)) +extern const struct inode_operations simple_symlink_inode_operations; +#endif + +#ifdef CONFIG_EROFS_FS_XATTR +extern const struct inode_operations erofs_symlink_xattr_iops; +extern const struct inode_operations erofs_fast_symlink_xattr_iops; +extern const struct inode_operations erofs_special_inode_operations; +#endif + +static inline void set_inode_fast_symlink(struct inode *inode) +{ +#ifdef CONFIG_EROFS_FS_XATTR + inode->i_op = &erofs_fast_symlink_xattr_iops; +#else + inode->i_op = &simple_symlink_inode_operations; +#endif +} + +static inline bool is_inode_fast_symlink(struct inode *inode) +{ +#ifdef CONFIG_EROFS_FS_XATTR + return inode->i_op == &erofs_fast_symlink_xattr_iops; +#else + return inode->i_op == &simple_symlink_inode_operations; +#endif +} + +static inline void *erofs_vmap(struct page **pages, unsigned int count) +{ +#ifdef CONFIG_EROFS_FS_USE_VM_MAP_RAM + int i = 0; + + while (1) { + void *addr = vm_map_ram(pages, count, -1, PAGE_KERNEL); + /* retry two more times (totally 3 times) */ + if (addr != NULL || ++i >= 3) + return addr; + vm_unmap_aliases(); + } + return NULL; +#else + return vmap(pages, count, VM_MAP, PAGE_KERNEL); +#endif +} + +static inline void erofs_vunmap(const void *mem, unsigned int count) +{ +#ifdef CONFIG_EROFS_FS_USE_VM_MAP_RAM + vm_unmap_ram(mem, count); +#else + vunmap(mem); +#endif +} 
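/*
 * Editor's illustration (not part of this patch): erofs_grab_bio() earlier in
 * this header degrades gracefully under memory pressure by halving the
 * requested page count on each failed multi-page allocation, and only the
 * final single-page attempt may be marked __GFP_NOFAIL. The userspace sketch
 * below mirrors that back-off policy with a stand-in allocator;
 * fake_bio_alloc() and its failure threshold are invented for the example and
 * do not model the real bio API.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* stand-in allocator: pretend anything above 4 "pages" cannot be satisfied */
static void *fake_bio_alloc(unsigned int nr_pages)
{
	return nr_pages > 4 ? NULL : malloc(64);
}

/* mirrors the halving loop in erofs_grab_bio() */
static void *grab_bio(unsigned int nr_pages, bool nofail)
{
	void *bio;

	do {
		if (nr_pages == 1) {
			/* the kernel code adds __GFP_NOFAIL here when nofail */
			bio = fake_bio_alloc(1);
			if (!bio) {
				/* DBG_BUGON(nofail) in the patch */
				if (nofail)
					fprintf(stderr, "nofail alloc failed\n");
				return NULL;	/* ERR_PTR(-ENOMEM) in the patch */
			}
			break;
		}
		bio = fake_bio_alloc(nr_pages);
		nr_pages /= 2;			/* back off geometrically, retry */
	} while (!bio);

	return bio;
}

int main(void)
{
	void *bio = grab_bio(16, false);	/* 16 -> 8 -> 4 succeeds */

	printf("bio %sallocated\n", bio ? "" : "not ");
	free(bio);
	return 0;
}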
+ +static inline void erofs_pagedump(struct page *page, const char *__caller_func__) +{ + void *out; + + if (!page) + errln("%s, page (null)", __caller_func__); + else + errln("%s, page (%lx) %px count %x flags %lx mapping %px private %lx", + __caller_func__, page->index, page, page_count(page), + page->flags, page->mapping, page_private(page)); + + out = kmap_atomic(page); + print_hex_dump(KERN_ERR, "page data: ", DUMP_PREFIX_OFFSET, + 16, 1, out, PAGE_SIZE, true); + kunmap_atomic(out); +} + +#define PAGE_BUGON(condition, page) do { \ + if (unlikely(condition)) { \ + erofs_pagedump(page, __func__); \ + BUG(); \ + } \ +} while (0) + +/* utils.c */ +extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); + +extern void erofs_register_super(struct super_block *sb); +extern void erofs_unregister_super(struct super_block *sb); + +extern unsigned long erofs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +extern unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); + +#endif + diff --git a/fs/erofs/lz4armv8/lz4accel.c b/fs/erofs/lz4armv8/lz4accel.c new file mode 100644 index 000000000000..8e77909208c8 --- /dev/null +++ b/fs/erofs/lz4armv8/lz4accel.c @@ -0,0 +1,34 @@ +#include "lz4accel.h" +#include + +int _lz4_decompress_asm(uint8_t **dst_ptr, uint8_t *dst_begin, + uint8_t *dst_end, const uint8_t **src_ptr, + const uint8_t *src_end); + +int _lz4_decompress_asm_noprfm(uint8_t **dst_ptr, uint8_t *dst_begin, + uint8_t *dst_end, const uint8_t **src_ptr, + const uint8_t *src_end); + + +int lz4_decompress_asm_select(uint8_t **dst_ptr, uint8_t *dst_begin, + uint8_t *dst_end, const uint8_t **src_ptr, + const uint8_t *src_end) { + const unsigned i = smp_processor_id(); + + switch(read_cpuid_part_number()) { + case ARM_CPU_PART_CORTEX_A53: + lz4_decompress_asm_fn[i] = _lz4_decompress_asm_noprfm; + return _lz4_decompress_asm_noprfm(dst_ptr, dst_begin, dst_end, + src_ptr, src_end); + } + lz4_decompress_asm_fn[i] = _lz4_decompress_asm; + return _lz4_decompress_asm(dst_ptr, dst_begin, dst_end, + src_ptr, src_end); +} + +int (*lz4_decompress_asm_fn[NR_CPUS])(uint8_t **dst_ptr, uint8_t *dst_begin, + uint8_t *dst_end, const uint8_t **src_ptr, const uint8_t *src_end) +__read_mostly = { + [0 ... 
NR_CPUS-1] = lz4_decompress_asm_select, +}; + diff --git a/fs/erofs/lz4armv8/lz4accel.h b/fs/erofs/lz4armv8/lz4accel.h new file mode 100644 index 000000000000..1b97a0245df9 --- /dev/null +++ b/fs/erofs/lz4armv8/lz4accel.h @@ -0,0 +1,54 @@ +#include +#include + +#if defined(CONFIG_ARM64) && defined(CONFIG_KERNEL_MODE_NEON) +#include +#include + +int _lz4_decompress_asm(uint8_t **dst_ptr, uint8_t *dst_begin, + uint8_t *dst_end, const uint8_t **src_ptr, + const uint8_t *src_end); + +int _lz4_decompress_asm_noprfm(uint8_t **dst_ptr, uint8_t *dst_begin, + uint8_t *dst_end, const uint8_t **src_ptr, + const uint8_t *src_end); + +static inline int lz4_decompress_accel_enable(void) +{ + return may_use_simd(); +} + +extern int (*lz4_decompress_asm_fn[])(uint8_t **dst_ptr, uint8_t *dst_begin, + uint8_t *dst_end, const uint8_t **src_ptr, + const uint8_t *src_end); + +static inline ssize_t lz4_decompress_asm( + uint8_t **dst_ptr, uint8_t *dst_begin, uint8_t *dst_end, + const uint8_t **src_ptr, const uint8_t *src_end) +{ + int ret; + + kernel_neon_begin(); + ret = lz4_decompress_asm_fn[smp_processor_id()](dst_ptr, dst_begin, + dst_end, src_ptr, + src_end); + kernel_neon_end(); + return (ssize_t)ret; +} + +#define __ARCH_HAS_LZ4_ACCELERATOR + +#else + +static inline int lz4_decompress_accel_enable(void) +{ + return 0; +} + +static inline ssize_t lz4_decompress_asm( + uint8_t **dst_ptr, uint8_t *dst_begin, uint8_t *dst_end, + const uint8_t **src_ptr, const uint8_t *src_end) +{ + return 0; +} +#endif diff --git a/fs/erofs/lz4armv8/lz4armv8.S b/fs/erofs/lz4armv8/lz4armv8.S new file mode 100644 index 000000000000..4882ee0f3924 --- /dev/null +++ b/fs/erofs/lz4armv8/lz4armv8.S @@ -0,0 +1,277 @@ +/* + * lz4armv8.S + * LZ4 decompression optimization based on arm64 NEON instruction + */ + +#include +#include + +/** + * _lz4_decompress_asm: The fast LZ4 decompression, lz4 decompression algothrim asm + * routine,support Huawei EROFS filesystem striving for maximum decompression speed. + * Entry point _lz4_decompress_asm. + * @para: + * x0 = current destination address ptr + * x1 = destination start position + * x2 = destination end position + * x3 = current source address ptr + * x4 = source end position + * @ret: + * 0 on success, -1 on failure + * + * x7: match_length + * x8: literal_legth + * x9: copy start ptr + * x10: copy end ptr + */ + + +#define match_length x7 +#define literal_length x8 +#define copy_from_ptr x9 /* copy source ptr*/ +#define copy_to_ptr x10 /* copy destination ptr*/ +#define w_tmp w11 /* temp var */ +#define tmp x11 +#define w_offset w12 +#define offset x12 +#define permtable_addr x13 +#define cplen_table_addr x14 +#define save_dst x15 +#define save_src x16 + + +/* x3 >= x4 src overflow */ +.macro check_src_overflow + cmp x3, x4 + b.hs Done +.endm + +/* x0 >= x2 dst overflow */ +.macro check_dst_overflow + cmp x0, x2 + b.hs Done +.endm + +.altmacro +.macro lz4_decompress_asm_generic doprfm=1 +LOCAL Lz4_decompress_begin, Decode_token, Get_literal_length +LOCAL Copy_long_literal_hs_15, Copy_long_literal_loop +LOCAL Copy_literal_lt_15, Decode_offset_matchlength +LOCAL Get_long_matchlength, Copy_match_begin +LOCAL Cond_offset_lt_matchlength, Copy_offset_lt_32 +LOCAL Copy_match_perm, Copy_offset_lt_32_loop +LOCAL Cond_offset_ge_matchlength, Copy_offset_ge_match_loop + stp x29, x30, [sp, #-16]! + mov x29, sp + stp x3, x0, [sp, #-16]! 
/* push src and dst in stack */ + ldr x3, [x3] /* x3 = *src_ptr */ + ldr x0, [x0] /* x0 = *dst_ptr */ + adr permtable_addr, Permtable + adr cplen_table_addr, Copylength_table + +Lz4_decompress_begin: + /* + * save current dst and src ,ensure when return from asm routine + * current both of "dst" and "src" save good position. + */ + mov save_dst, x0 + mov save_src, x3 + + check_dst_overflow + check_src_overflow + +.if \doprfm + add tmp, x0, #512 + cmp x2, tmp + b.ls Decode_token + prfm pstl2strm,[x0,#512] +.endif + + /* Decode Token Byte: */ +Decode_token: + ldrb w_tmp, [x3], #1 /* read Token Byte */ + lsr literal_length, tmp, #4 /* get literal_length */ + and match_length, tmp, #0xf /* get match_length */ + add match_length, match_length, #4 /* match_length >=4 */ + + /* + * literal_length <= 14 : no more literal length byte,fllowing zero + * or more bytes are liteal bytes. + */ + cmp literal_length, #14 + b.ls Copy_literal_lt_15 + + /* + * literal_length == 15 : more literal length bytes after TokenByte. + * continue decoding more literal length bytes. + */ +Get_literal_length: + check_src_overflow + ldrb w_tmp, [x3], #1 + add literal_length, literal_length, tmp + cmp tmp, #255 + b.eq Get_literal_length + +/* literal copy */ +Copy_long_literal_hs_15: + mov copy_from_ptr, x3 + mov copy_to_ptr, x0 + add x3, x3, literal_length + add x0, x0, literal_length + check_dst_overflow + check_src_overflow + +Copy_long_literal_loop: + ldr q0, [copy_from_ptr], #16 + str q0, [copy_to_ptr], #16 + + cmp x0, copy_to_ptr + b.ls Decode_offset_matchlength + b Copy_long_literal_loop + +Copy_literal_lt_15: + ldr q0, [x3] + str q0, [x0] + add x3, x3, literal_length + add x0, x0, literal_length + + /* Decode offset and match_length */ +Decode_offset_matchlength: + ldrh w_offset, [x3], #2 /* 2Byte:offset bytes */ + cbz offset, Failed /* match_length == 0 is invalid */ + sub copy_from_ptr, x0, offset + cmp copy_from_ptr, x1 + b.lo Failed + mov copy_to_ptr, x0 + /* + * set x0 to the end of "match copy"; + */ + add x0, x0, match_length + cmp match_length, #19 + b.lo Copy_match_begin + /* + * continue decoding more match length bytes. + */ +Get_long_matchlength: + check_src_overflow + ldrb w_tmp, [x3], #1 + add x0, x0, tmp + add match_length, match_length, tmp + cmp tmp, #255 + b.eq Get_long_matchlength + check_dst_overflow + /* + * here got the matchlength,start "match copy". 
+ */ +Copy_match_begin: + cmp offset , match_length + b.hs Cond_offset_ge_matchlength + +Cond_offset_lt_matchlength: + cmp offset , #32 + b.hs Cond_offset_ge_matchlength + +Copy_offset_lt_32: + ldr q1, [copy_from_ptr] + add tmp, permtable_addr, offset, lsl #5 + ldp q2, q3, [tmp] + tbl v0.16b, {v1.16b}, v2.16b + tbl v1.16b, {v1.16b}, v3.16b + cmp offset , #16 + b.lo Copy_match_perm + ldp q0, q1, [copy_from_ptr] +Copy_match_perm: + ldrb w_tmp, [cplen_table_addr, offset] + stp q0, q1, [copy_to_ptr] + add copy_to_ptr, copy_to_ptr, tmp + cmp x0, copy_to_ptr + b.ls Lz4_decompress_begin +Copy_offset_lt_32_loop: + stp q0, q1, [copy_to_ptr] + add copy_to_ptr, copy_to_ptr, tmp + stp q0, q1, [copy_to_ptr] + add copy_to_ptr, copy_to_ptr, tmp + cmp x0, copy_to_ptr + b.hi Copy_offset_lt_32_loop + b Lz4_decompress_begin + +/* offset >= match */ +Cond_offset_ge_matchlength: + ldr q0, [copy_from_ptr], #16 + str q0, [copy_to_ptr], #16 + + cmp x0, copy_to_ptr + b.ls Lz4_decompress_begin +Copy_offset_ge_match_loop: + ldp q0, q1, [copy_from_ptr], #32 + stp q0, q1, [copy_to_ptr], #32 + + cmp x0, copy_to_ptr + b.hi Copy_offset_ge_match_loop + b Lz4_decompress_begin +.endm + +.text +.p2align 4 + +ENTRY(_lz4_decompress_asm) + lz4_decompress_asm_generic +ENDPROC(_lz4_decompress_asm) + +Failed: + mov tmp, #-1 + b Exit_here + +Done: + mov tmp, #0 + +Exit_here: + ldp x3, x0, [sp], #16 + str save_src, [x3] + str save_dst, [x0] + mov x0, tmp + ldp x29, x30, [sp], #16 + ret x30 + + +/* + * In case of offset <= 31 < matchlength ,expand the pattern and store in + * repeating pattern size(RPS),store the RPS in Copylength_table. + * case 1): 1 <= offset <= 15 + * expand the pattern according to the Permtable and store their repeating pattern in q0 q1; + * RPS = 32 - (32 % offset) offset <= 31 + * case 2): offset >= 16 + * read the pattern and store in q0 q1. + * RPS = offset. 
+ */ +.text +.p2align 8 +Permtable: +.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 //offset = 0 +.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 //offset = 1 +.byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 //offset = 2 +.byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 //offset = 3 +.byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 //offset = 4 +.byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 //offset = 5 +.byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 //offset = 6 +.byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 //offset = 7 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 //offset = 8 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 //offset = 9 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 //offset = 10 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 //offset = 11 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 //offset = 12 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 //offset = 13 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 //offset = 14 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 //offset = 15 + +.p2align 8 +Copylength_table: +.byte 32,32,32,30,32,30,30,28,32,27,30,22,24,26,28,30 // 0 .. 15 +.byte 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 // 16 .. 31 + + +.text +.p2align 4 +ENTRY(_lz4_decompress_asm_noprfm) + lz4_decompress_asm_generic 0 +ENDPROC(_lz4_decompress_asm_noprfm) diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c new file mode 100644 index 000000000000..b6c4cdaa75ad --- /dev/null +++ b/fs/erofs/namei.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/namei.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#include "internal.h" +#include "xattr.h" + +#include + +/* based on the value of qn->len is accurate */ +static inline int dirnamecmp(struct qstr *qn, + struct qstr *qd, unsigned *matched) +{ + unsigned i = *matched, len = min(qn->len, qd->len); +loop: + if (unlikely(i >= len)) { + *matched = i; + if (qn->len < qd->len) { + /* + * actually (qn->len == qd->len) + * when qd->name[i] == '\0' + */ + return qd->name[i] == '\0' ? 0 : -1; + } + return (qn->len > qd->len); + } + + if (qn->name[i] != qd->name[i]) { + *matched = i; + return qn->name[i] > qd->name[i] ? 
1 : -1; + } + + ++i; + goto loop; +} + +static struct erofs_dirent *find_target_dirent( + struct qstr *name, + u8 *data, int maxsize) +{ + unsigned ndirents, head, back; + unsigned startprfx, endprfx; + struct erofs_dirent *const de = (struct erofs_dirent *)data; + + /* make sure that maxsize is valid */ + BUG_ON(maxsize < sizeof(struct erofs_dirent)); + + ndirents = le16_to_cpu(de->nameoff) / sizeof(*de); + + /* corrupted dir (may be unnecessary...) */ + BUG_ON(!ndirents); + + head = 0; + back = ndirents - 1; + startprfx = endprfx = 0; + + while (head <= back) { + unsigned mid = head + (back - head) / 2; + unsigned nameoff = le16_to_cpu(de[mid].nameoff); + unsigned matched = min(startprfx, endprfx); + + struct qstr dname = QSTR_INIT(data + nameoff, + unlikely(mid >= ndirents - 1) ? + maxsize - nameoff : + le16_to_cpu(de[mid + 1].nameoff) - nameoff); + + /* string comparison without already matched prefix */ + int ret = dirnamecmp(name, &dname, &matched); + + if (unlikely(!ret)) + return de + mid; + else if (ret > 0) { + head = mid + 1; + startprfx = matched; + } else if (unlikely(mid < 1)) /* fix "mid" overflow */ + break; + else { + back = mid - 1; + endprfx = matched; + } + } + + return ERR_PTR(-ENOENT); +} + +static struct page *find_target_block_classic( + struct inode *dir, + struct qstr *name, int *_diff) +{ + unsigned startprfx, endprfx; + unsigned head, back; + struct address_space *const mapping = dir->i_mapping; + struct page *candidate = ERR_PTR(-ENOENT); + + startprfx = endprfx = 0; + head = 0; + back = inode_datablocks(dir) - 1; + + while (head <= back) { + unsigned mid = head + (back - head) / 2; + struct page *page = read_mapping_page(mapping, mid, NULL); + + if (IS_ERR(page)) { +exact_out: + if (!IS_ERR(candidate)) /* valid candidate */ + put_page(candidate); + return page; + } else { + int diff; + unsigned ndirents, matched; + struct qstr dname; + struct erofs_dirent *de = kmap_atomic(page); + unsigned nameoff = le16_to_cpu(de->nameoff); + + ndirents = nameoff / sizeof(*de); + + /* corrupted dir (should have one entry at least) */ + BUG_ON(!ndirents || nameoff > PAGE_SIZE); + + matched = min(startprfx, endprfx); + + dname.name = (u8 *)de + nameoff; + dname.len = ndirents == 1 ? + /* since the rest of the last page is 0 */ + EROFS_BLKSIZ - nameoff + : le16_to_cpu(de[1].nameoff) - nameoff; + + /* string comparison without already matched prefix */ + diff = dirnamecmp(name, &dname, &matched); + kunmap_atomic(de); + + if (unlikely(!diff)) { + *_diff = 0; + goto exact_out; + } else if (diff > 0) { + head = mid + 1; + startprfx = matched; + + if (likely(!IS_ERR(candidate))) + put_page(candidate); + candidate = page; + } else { + put_page(page); + + if (unlikely(mid < 1)) /* fix "mid" overflow */ + break; + + back = mid - 1; + endprfx = matched; + } + } + } + *_diff = 1; + return candidate; +} + +int erofs_namei(struct inode *dir, + struct qstr *name, + erofs_nid_t *nid, unsigned *d_type) +{ + int diff; + struct page *page; + u8 *data; + struct erofs_dirent *de; + + if (unlikely(!dir->i_size)) + return -ENOENT; + + diff = 1; + page = find_target_block_classic(dir, name, &diff); + + if (unlikely(IS_ERR(page))) + return PTR_ERR(page); + + data = kmap_atomic(page); + /* the target page has been mapped */ + de = likely(diff) ? 
+ /* since the rest of the last page is 0 */ + find_target_dirent(name, data, EROFS_BLKSIZ) : + (struct erofs_dirent *)data; + + if (likely(!IS_ERR(de))) { + *nid = le64_to_cpu(de->nid); + *d_type = de->file_type; + } + + kunmap_atomic(data); + put_page(page); + + return PTR_ERR_OR_ZERO(de); +} + +/* NOTE: i_mutex is already held by vfs */ +static struct dentry *erofs_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + int err; + erofs_nid_t nid; + unsigned d_type; + struct inode *inode; + + DBG_BUGON(!d_really_is_negative(dentry)); + /* dentry must be unhashed in lookup, no need to worry about */ + DBG_BUGON(!d_unhashed(dentry)); + + trace_erofs_lookup(dir, dentry, flags); + + /* file name exceeds fs limit */ + if (unlikely(dentry->d_name.len > EROFS_NAME_LEN)) + return ERR_PTR(-ENAMETOOLONG); + + /* false uninitialized warnings on gcc 4.8.x */ + err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); + + if (err == -ENOENT) { + /* negative dentry */ + inode = NULL; + goto negative_out; + } else if (unlikely(err)) + return ERR_PTR(err); + + debugln("%s, %s (nid %llu) found, d_type %u", __func__, + dentry->d_name.name, nid, d_type); + + inode = erofs_iget(dir->i_sb, nid, d_type == EROFS_FT_DIR); + if (IS_ERR(inode)) + return ERR_CAST(inode); + +negative_out: + return d_splice_alias(inode, dentry); +} + +const struct inode_operations erofs_dir_iops = { + .lookup = erofs_lookup, +}; + +const struct inode_operations erofs_dir_xattr_iops = { + .lookup = erofs_lookup, +#ifdef CONFIG_EROFS_FS_XATTR +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + .getxattr = generic_getxattr, +#endif + .listxattr = erofs_listxattr, +#endif +}; + diff --git a/fs/erofs/staging.h b/fs/erofs/staging.h new file mode 100644 index 000000000000..47c9708d295a --- /dev/null +++ b/fs/erofs/staging.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* should be avoid in the future */ +#include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 31)) +__SETPAGEFLAG(Referenced, referenced) +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)) +#define d_inode(d) ((d)->d_inode) +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0)) +#define d_really_is_negative(d) (d_inode(d) == NULL) +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)) +/* Restricts the given gfp_mask to what the mapping allows. 
*/ +static inline gfp_t mapping_gfp_constraint( + struct address_space *mapping, + gfp_t gfp_mask) +{ + return mapping_gfp_mask(mapping) & gfp_mask; +} +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 116)) +static inline void inode_nohighmem(struct inode *inode) +{ + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); +} +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0)) + +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, + unsigned op, unsigned op_flags) { + bio->bi_rw = op | op_flags; +} + +static inline gfp_t readahead_gfp_mask(struct address_space *x) +{ + return mapping_gfp_mask(x) | __GFP_COLD | + __GFP_NORETRY | __GFP_NOWARN; +} +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 13)) +#define READ_ONCE(x) ACCESS_ONCE(x) +#define WRITE_ONCE(x, val) (ACCESS_ONCE(x) = (val)) +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 40)) +static inline int lockref_put_return(struct lockref *lockref) +{ + return -1; +} +#endif + +#ifndef WQ_NON_REENTRANT +#define WQ_NON_REENTRANT 0 +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)) +#define page_cache_get(page) get_page(page) +#define page_cache_release(page) put_page(page) +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0)) +static inline bool sb_rdonly(const struct super_block *sb) { + return sb->s_flags & MS_RDONLY; +} + +#define bio_set_dev(bio, bdev) ((bio)->bi_bdev = (bdev)) + +#endif + +#ifndef lru_to_page +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)) + +static inline void *kvmalloc(size_t size, gfp_t flags) +{ + void *buffer = NULL; + + if (size == 0) + return NULL; + + /* do not attempt kmalloc if we need more than 16 pages at once */ + if (size <= (16 * PAGE_SIZE)) + buffer = kmalloc(size, flags); + if (!buffer) { + if (flags & __GFP_ZERO) + buffer = vzalloc(size); + else + buffer = vmalloc(size); + } + return buffer; +} + +static inline void *kvzalloc(size_t size, gfp_t flags) +{ + return kvmalloc(size, flags | __GFP_ZERO); +} + +static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) +{ + if (size != 0 && n > SIZE_MAX / size) + return NULL; + + return kvmalloc(n * size, flags); +} + +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0)) +static inline void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +#endif + diff --git a/fs/erofs/super.c b/fs/erofs/super.c new file mode 100644 index 000000000000..be24d82f4d1c --- /dev/null +++ b/fs/erofs/super.c @@ -0,0 +1,731 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/super.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ +#include +#include +#include +#include +#include +#include "internal.h" +#ifdef CONFIG_MTK_BOOT +#include +#endif + +#define CREATE_TRACE_POINTS +#include + +static struct kmem_cache *erofs_inode_cachep __read_mostly; + +static void init_once(void *ptr) +{ + struct erofs_vnode *vi = ptr; + + inode_init_once(&vi->vfs_inode); +} + +static int erofs_init_inode_cache(void) +{ + erofs_inode_cachep = kmem_cache_create("erofs_inode", + sizeof(struct erofs_vnode), 0, + SLAB_RECLAIM_ACCOUNT, init_once); + + return erofs_inode_cachep != NULL ? 0 : -ENOMEM; +} + +static void erofs_exit_inode_cache(void) +{ + BUG_ON(erofs_inode_cachep == NULL); + kmem_cache_destroy(erofs_inode_cachep); +} + +static struct inode *alloc_inode(struct super_block *sb) +{ + struct erofs_vnode *vi = + kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL); + + if (vi == NULL) + return NULL; + + /* zero out everything except vfs_inode */ + memset(vi, 0, offsetof(struct erofs_vnode, vfs_inode)); + return &vi->vfs_inode; +} + +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct erofs_vnode *vi = EROFS_V(inode); + + /* be careful RCU symlink path (see ext4_inode_info->i_data)! */ + if (is_inode_fast_symlink(inode)) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0)) + kfree(vi->i_link); +#else + kfree(inode->i_link); +#endif + + kfree(vi->xattr_shared_xattrs); + + kmem_cache_free(erofs_inode_cachep, vi); +} + +static void destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, i_callback); +} + +static int superblock_read(struct super_block *sb) +{ + struct erofs_sb_info *sbi; + struct buffer_head *bh; + struct erofs_super_block *layout; + unsigned blkszbits; + int ret; + + bh = sb_bread(sb, 0); + + if (bh == NULL) { + errln("cannot read erofs superblock"); + return -EIO; + } + + sbi = EROFS_SB(sb); + layout = (struct erofs_super_block *)((u8 *)bh->b_data + + EROFS_SUPER_OFFSET); + + ret = -EINVAL; + if (le32_to_cpu(layout->magic) != EROFS_SUPER_MAGIC_V1) { + errln("cannot find valid erofs superblock"); + goto out; + } + + blkszbits = layout->blkszbits; + /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ + if (unlikely(blkszbits != LOG_BLOCK_SIZE)) { + errln("blksize %u isn't supported on this platform", + 1 << blkszbits); + goto out; + } + + sbi->blocks = le32_to_cpu(layout->blocks); + sbi->meta_blkaddr = le32_to_cpu(layout->meta_blkaddr); +#ifdef CONFIG_EROFS_FS_XATTR + sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr); +#endif + sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1; +#ifdef CONFIG_EROFS_FS_ZIP + /* TODO: clusterbits should be related to inode */ + sbi->clusterbits = blkszbits; + + if (1 << (sbi->clusterbits - PAGE_SHIFT) > Z_EROFS_CLUSTER_MAX_PAGES) + errln("clusterbits %u is not supported on this kernel", + sbi->clusterbits); +#endif + + sbi->root_nid = le16_to_cpu(layout->root_nid); + sbi->inos = le64_to_cpu(layout->inos); + + sbi->build_time = le64_to_cpu(layout->build_time); + sbi->build_time_nsec = le32_to_cpu(layout->build_time_nsec); + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) + memcpy(sb->s_uuid, layout->uuid, sizeof(layout->uuid)); +#else + memcpy(&sb->s_uuid, layout->uuid, sizeof(layout->uuid)); +#endif + memcpy(sbi->volume_name, layout->volume_name, + sizeof(layout->volume_name)); + + ret = 0; +out: + brelse(bh); + return ret; +} + +#ifdef CONFIG_EROFS_FAULT_INJECTION +char *erofs_fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", +}; + +static void erofs_build_fault_attr(struct 
erofs_sb_info *sbi, + unsigned int rate) +{ + struct erofs_fault_info *ffi = &sbi->fault_info; + + if (rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + ffi->inject_type = (1 << FAULT_MAX) - 1; + } else { + memset(ffi, 0, sizeof(struct erofs_fault_info)); + } +} +#endif + +static void default_options(struct erofs_sb_info *sbi) +{ + /* set up some FS parameters */ +#ifdef CONFIG_EROFS_FS_ZIP + sbi->max_sync_decompress_pages = DEFAULT_MAX_SYNC_DECOMPRESS_PAGES; +#endif + +#ifdef CONFIG_EROFS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif + +#ifdef CONFIG_EROFS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif + + set_opt(sbi, LZ4ASM); +} + +static bool force_disable_erofs = false; +enum { + Opt_user_xattr, + Opt_nouser_xattr, + Opt_acl, + Opt_noacl, + Opt_fault_injection, + Opt_lz4asm, + Opt_nolz4asm, + Opt_fmount, + Opt_err +}; + +static match_table_t erofs_tokens = { + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_lz4asm, "lz4asm"}, + {Opt_nolz4asm, "nolz4asm"}, + {Opt_fmount, "fmount"}, + {Opt_err, NULL} +}; + +static int parse_options(struct super_block *sb, char *options) +{ + bool force_panic_mount = false; + substring_t args[MAX_OPT_ARGS]; + char *p; + int arg = 0; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + args[0].to = args[0].from = NULL; + token = match_token(p, erofs_tokens, args); + + switch (token) { +#ifdef CONFIG_EROFS_FS_XATTR + case Opt_user_xattr: + set_opt(EROFS_SB(sb), XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(EROFS_SB(sb), XATTR_USER); + break; +#else + case Opt_user_xattr: + infoln("user_xattr options not supported"); + break; + case Opt_nouser_xattr: + infoln("nouser_xattr options not supported"); + break; +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + case Opt_acl: + set_opt(EROFS_SB(sb), POSIX_ACL); + break; + case Opt_noacl: + clear_opt(EROFS_SB(sb), POSIX_ACL); + break; +#else + case Opt_acl: + infoln("acl options not supported"); + break; + case Opt_noacl: + infoln("noacl options not supported"); + break; +#endif + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_EROFS_FAULT_INJECTION + erofs_build_fault_attr(EROFS_SB(sb), arg); + set_opt(EROFS_SB(sb), FAULT_INJECTION); +#else + infoln("FAULT_INJECTION was not selected"); +#endif + break; + case Opt_lz4asm: + set_opt(EROFS_SB(sb), LZ4ASM); + break; + case Opt_nolz4asm: + clear_opt(EROFS_SB(sb), LZ4ASM); + break; + case Opt_fmount: + force_panic_mount = true; + break; +#if 0 + default: + errln("Unrecognized mount option \"%s\" " + "or missing value", p); + return -EINVAL; +#endif + } + } + + if (force_disable_erofs && !force_panic_mount) { + pr_emerg("disable erofs mount due to panic in recovery"); + return -EINVAL; + } + + return 0; +} + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + +static const struct address_space_operations managed_cache_aops; +extern atomic_long_t erofs_global_shrink_cnt; + +static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask) +{ + int ret = 1; /* 0 - busy */ + struct address_space *const mapping = page->mapping; + static unsigned long start_jiffies = 0; + unsigned long temp_jiffies; + + BUG_ON(!PageLocked(page)); + BUG_ON(mapping->a_ops != &managed_cache_aops); + + temp_jiffies = jiffies; + if (time_after(temp_jiffies, start_jiffies + 20 * HZ)) { + errln("%s, nr_cached_pages = %lu global_shrink_cnt = 
%lu", + __func__, READ_ONCE(mapping->nrpages), + atomic_long_read(&erofs_global_shrink_cnt)); + WRITE_ONCE(start_jiffies, temp_jiffies); + } + + if (PagePrivate(page)) + ret = erofs_try_to_free_cached_page(mapping, page); + + return ret; +} + +static void managed_cache_invalidatepage(struct page *page, + unsigned int offset, unsigned int length) +{ + const unsigned int stop = length + offset; + + BUG_ON(!PageLocked(page)); + + /* Check for overflow */ + BUG_ON(stop > PAGE_SIZE || stop < length); + + if (offset == 0 && stop == PAGE_SIZE) + while (!managed_cache_releasepage(page, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations managed_cache_aops = { + .releasepage = managed_cache_releasepage, + .invalidatepage = managed_cache_invalidatepage, +#ifdef CONFIG_MIGRATION + .migratepage = erofs_migrate_cached_page, +#endif +}; + +static struct inode *erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + + if (unlikely(inode == NULL)) + return ERR_PTR(-ENOMEM); + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + + inode->i_mapping->a_ops = &managed_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | + __GFP_MOVABLE +#if defined(CONFIG_CMA) && defined(___GFP_CMA) + | ___GFP_CMA +#endif + ); + return inode; +} + +#endif + +static int erofs_read_super(struct super_block *sb, + const char *dev_name, void *data, int silent) +{ + struct inode *inode; + struct erofs_sb_info *sbi; + int err = -EINVAL; + + infoln("read_super, device -> %s", dev_name); + infoln("options -> %s", (char *)data); + + if (unlikely(!sb_set_blocksize(sb, EROFS_BLKSIZ))) { + errln("failed to set erofs blksize"); + goto err; + } + + sbi = kzalloc(sizeof(struct erofs_sb_info), GFP_KERNEL); + if (unlikely(sbi == NULL)) { + err = -ENOMEM; + goto err; + } + sb->s_fs_info = sbi; + + err = superblock_read(sb); + if (err) + goto err_sbread; + + sb->s_magic = EROFS_SUPER_MAGIC; + sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + + sb->s_op = &erofs_sops; + +#ifdef CONFIG_EROFS_FS_XATTR + sb->s_xattr = erofs_xattr_handlers; +#endif + + /* set erofs default mount options */ + default_options(sbi); + + err = parse_options(sb, data); + if (err) + goto err_parseopt; + + if (!silent) + infoln("root inode @ nid %llu", ROOT_NID(sbi)); + +#ifdef CONFIG_EROFS_FS_ZIP + INIT_RADIX_TREE(&sbi->workstn.tree, GFP_ATOMIC); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)) + spin_lock_init(&sbi->workstn.lock); +#endif +#endif + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + sbi->managed_cache = erofs_init_managed_cache(sb); + if (IS_ERR(sbi->managed_cache)) { + err = PTR_ERR(sbi->managed_cache); + goto err_init_managed_cache; + } +#endif + + /* get the root inode */ + inode = erofs_iget(sb, ROOT_NID(sbi), true); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto err_iget; + } + + if (!S_ISDIR(inode->i_mode)) { + errln("rootino(nid %llu) is not a directory(i_mode %o)", + ROOT_NID(sbi), inode->i_mode); + err = -EINVAL; + goto err_isdir; + } + + sb->s_root = d_make_root(inode); + if (sb->s_root == NULL) { + err = -ENOMEM; + goto err_makeroot; + } + + /* save the device name to sbi */ + sbi->dev_name = __getname(); + if (sbi->dev_name == NULL) { + err = -ENOMEM; + goto err_devname; + } + + snprintf(sbi->dev_name, PATH_MAX, "%s", dev_name); + sbi->dev_name[PATH_MAX - 1] = '\0'; + + erofs_register_super(sb); + + /* + * We already have a positive dentry, which was instantiated + * by d_make_root. 
Just need to d_rehash it. + */ + d_rehash(sb->s_root); + + if (!silent) + infoln("mounted on %s with opts: %s.", dev_name, + (char *)data); + return 0; + /* + * please add a label for each exit point and use + * the following name convention, thus new features + * can be integrated easily without renaming labels. + */ +err_devname: + dput(sb->s_root); +err_makeroot: +err_isdir: + if (sb->s_root == NULL) + iput(inode); +err_iget: +#ifdef EROFS_FS_HAS_MANAGED_CACHE + iput(sbi->managed_cache); +err_init_managed_cache: +#endif +err_parseopt: +err_sbread: + sb->s_fs_info = NULL; + kfree(sbi); +err: + return err; +} + +/* + * could be triggered after deactivate_locked_super() + * is called, thus including umount and failed to initialize. + */ +static void erofs_put_super(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + /* for cases which are failed in "read_super" */ + if (sbi == NULL) + return; + + WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); + + infoln("unmounted for %s", sbi->dev_name); + __putname(sbi->dev_name); + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + iput(sbi->managed_cache); +#endif + + mutex_lock(&sbi->umount_mutex); + +#ifdef CONFIG_EROFS_FS_ZIP + erofs_workstation_cleanup_all(sb); +#endif + + erofs_unregister_super(sb); + mutex_unlock(&sbi->umount_mutex); + + kfree(sbi); + sb->s_fs_info = NULL; +} + + +struct erofs_mount_private { + const char *dev_name; + char *options; +}; + +/* support mount_bdev() with options */ +static int erofs_fill_super(struct super_block *sb, + void *_priv, int silent) +{ + struct erofs_mount_private *priv = _priv; + + return erofs_read_super(sb, priv->dev_name, + priv->options, silent); +} + +static struct dentry *erofs_mount( + struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct erofs_mount_private priv = { + .dev_name = dev_name, + .options = data + }; + + return mount_bdev(fs_type, flags, dev_name, + &priv, erofs_fill_super); +} + +static void erofs_kill_sb(struct super_block *sb) +{ + kill_block_super(sb); +} + +static struct shrinker erofs_shrinker_info = { + .scan_objects = erofs_shrink_scan, + .count_objects = erofs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + +static struct file_system_type erofs_fs_type = { + .owner = THIS_MODULE, + .name = "erofs", + .mount = erofs_mount, + .kill_sb = erofs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("erofs"); + +#ifdef CONFIG_EROFS_FS_ZIP +extern int z_erofs_init_zip_subsystem(void); +extern void z_erofs_exit_zip_subsystem(void); +#endif +unsigned int get_boot_into_recovery_flag(void); + +static int __init erofs_module_init(void) +{ + int err; + + erofs_check_ondisk_layout_definitions(); + infoln("initializing erofs " EROFS_VERSION); + + err = erofs_init_inode_cache(); + if (err) + goto icache_err; + + err = register_shrinker(&erofs_shrinker_info); + if (err) + goto shrinker_err; + +#ifdef CONFIG_EROFS_FS_ZIP + err = z_erofs_init_zip_subsystem(); + if (err) + goto zip_err; +#endif + + err = register_filesystem(&erofs_fs_type); + if (err) + goto fs_err; + + infoln("successfully to initialize erofs"); +#ifdef CONFIG_HISI_CMDLINE_PARSE + if (get_boot_into_recovery_flag() && + strstr(saved_command_line, "reboot_reason=AP_S_PANIC")) + force_disable_erofs = true; +#endif + +#ifdef CONFIG_MTK_BOOT + if (get_boot_mode() == RECOVERY_BOOT && + strstr(saved_command_line, "reboot_reason=AP_S_PANIC")) + force_disable_erofs = true; +#endif + return 0; + +fs_err: +#ifdef CONFIG_EROFS_FS_ZIP + z_erofs_exit_zip_subsystem(); +zip_err: +#endif + 
unregister_shrinker(&erofs_shrinker_info); +shrinker_err: + erofs_exit_inode_cache(); +icache_err: + return err; +} + +static void __exit erofs_module_exit(void) +{ + unregister_filesystem(&erofs_fs_type); +#ifdef CONFIG_EROFS_FS_ZIP + z_erofs_exit_zip_subsystem(); +#endif + unregister_shrinker(&erofs_shrinker_info); + erofs_exit_inode_cache(); + infoln("successfully finalize erofs"); +} + +/* get filesystem statistics */ +static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = EROFS_BLKSIZ; + buf->f_blocks = sbi->blocks; + buf->f_bfree = buf->f_bavail = 0; + + buf->f_files = ULLONG_MAX; + buf->f_ffree = ULLONG_MAX - sbi->inos; + + buf->f_namelen = EROFS_NAME_LEN; + + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +static int erofs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb); + +#ifdef CONFIG_EROFS_FS_XATTR + if (test_opt(sbi, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + else + seq_puts(seq, ",nouser_xattr"); +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + if (test_opt(sbi, POSIX_ACL)) + seq_puts(seq, ",acl"); + else + seq_puts(seq, ",noacl"); +#endif +#ifdef CONFIG_EROFS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) + seq_printf(seq, ",fault_injection=%u", + sbi->fault_info.inject_rate); +#endif + if (test_opt(sbi, LZ4ASM)) + seq_puts(seq, ",lz4asm"); + else + seq_puts(seq, ",nolz4asm"); + + return 0; +} + +static int erofs_remount(struct super_block *sb, int *flags, char *data) +{ + BUG_ON(!sb_rdonly(sb)); + + *flags |= MS_RDONLY; + return 0; +} + +const struct super_operations erofs_sops = { + .put_super = erofs_put_super, + .alloc_inode = alloc_inode, + .destroy_inode = destroy_inode, + .statfs = erofs_statfs, + .show_options = erofs_show_options, + .remount_fs = erofs_remount, +}; + +module_init(erofs_module_init); +module_exit(erofs_module_exit); + +MODULE_DESCRIPTION("Enhanced ROM File System"); +MODULE_AUTHOR("Gao Xiang, Yu Chao, Miao Xie, CONSUMER BG, HUAWEI Inc."); +MODULE_LICENSE("GPL"); + diff --git a/fs/erofs/unzip_lz4.c b/fs/erofs/unzip_lz4.c new file mode 100644 index 000000000000..a9ec8731ee1b --- /dev/null +++ b/fs/erofs/unzip_lz4.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/unzip_lz4.c + * + * Copyright (C) 2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang + */ +#include "generic/lz4.h" +#include "lz4armv8/lz4accel.h" + +#define LZ4_FAST_MARGIN (128) + +static ssize_t __maybe_unused __lz4_decompress_safe_partial_trusted( + void *dest, + size_t outputSize, + const void *source, + size_t inputSize, + bool accel) +{ + uint8_t *dstPtr = dest; + const uint8_t *srcPtr = source; + ssize_t ret; + +#ifdef __ARCH_HAS_LZ4_ACCELERATOR + /* Go fast if we can, keeping away from the end of buffers */ + if (outputSize > LZ4_FAST_MARGIN && inputSize > LZ4_FAST_MARGIN && + accel && lz4_decompress_accel_enable()) { + ret = lz4_decompress_asm(&dstPtr, dest, + dest + outputSize - LZ4_FAST_MARGIN, + &srcPtr, + source + inputSize - LZ4_FAST_MARGIN); + if (ret) + return -1; + } +#endif + /* Finish in safe */ + return __lz4_decompress_safe_partial(dstPtr, srcPtr, dest, outputSize, + source, inputSize, true); +} + +static ssize_t __maybe_unused __lz4_decompress_safe_partial_untrusted( + void *dest, + size_t outputSize, + const void *source, + size_t inputSize, + bool accel) +{ + uint8_t *dstPtr = dest; + const uint8_t *srcPtr = source; + ssize_t ret; + +#ifdef __ARCH_HAS_LZ4_ACCELERATOR + /* Go fast if we can, keeping away from the end of buffers */ + if (outputSize > LZ4_FAST_MARGIN && inputSize > LZ4_FAST_MARGIN && + accel && lz4_decompress_accel_enable()) { + ret = lz4_decompress_asm(&dstPtr, dest, + dest + outputSize - LZ4_FAST_MARGIN, + &srcPtr, + source + inputSize - LZ4_FAST_MARGIN); + if (ret) + return -1; + } +#endif + /* Finish in safe */ + return __lz4_decompress_safe_partial(dstPtr, srcPtr, dest, outputSize, + source, inputSize, false); +} + +int z_erofs_unzip_lz4(void *in, void *out, size_t inlen, + size_t outlen, bool accel) +{ + ssize_t ret; + +#ifdef CONFIG_EROFS_FS_DEBUG + ret = __lz4_decompress_safe_partial_untrusted(out, outlen, in, inlen, accel); +#else + ret = __lz4_decompress_safe_partial_trusted(out, outlen, in, inlen, accel); +#endif + + if (ret >= 0) + return (int)ret; + + /* + * LZ4_decompress_safe will return an error code + * (< 0) if decompression failed + */ + errln("%s, failed to decompress, in[%p, %zu] outlen[%p, %zu]", + __func__, in, inlen, out, outlen); + WARN_ON(1); + print_hex_dump(KERN_DEBUG, "raw data [in]: ", DUMP_PREFIX_OFFSET, + 16, 1, in, inlen, true); + print_hex_dump(KERN_DEBUG, "raw data [out]: ", DUMP_PREFIX_OFFSET, + 16, 1, out, outlen, true); + return -EIO; +} + diff --git a/fs/erofs/unzip_pagevec.h b/fs/erofs/unzip_pagevec.h new file mode 100644 index 000000000000..0956615b86f7 --- /dev/null +++ b/fs/erofs/unzip_pagevec.h @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * linux/drivers/staging/erofs/unzip_pagevec.h + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ +#ifndef __EROFS_UNZIP_PAGEVEC_H +#define __EROFS_UNZIP_PAGEVEC_H + +#include + +/* page type in pagevec for unzip subsystem */ +enum z_erofs_page_type { + /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ + Z_EROFS_PAGE_TYPE_EXCLUSIVE, + + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, + + Z_EROFS_VLE_PAGE_TYPE_HEAD, + Z_EROFS_VLE_PAGE_TYPE_MAX +}; + +extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") + __bad_page_type_exclusive(void); + +/* pagevec tagged pointer */ +typedef tagptr2_t erofs_vtptr_t; + +/* pagevec collector */ +struct z_erofs_pagevec_ctor { + struct page *curr, *next; + erofs_vtptr_t *pages; + + unsigned int nr, index; +}; + +static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, + bool atomic) +{ + if (ctor->curr == NULL) + return; + + if (atomic) + kunmap_atomic(ctor->pages); + else + kunmap(ctor->curr); +} + +static inline struct page * +z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, + unsigned nr) +{ + unsigned index; + + /* keep away from occupied pages */ + if (ctor->next != NULL) + return ctor->next; + + for (index = 0; index < nr; ++index) { + const erofs_vtptr_t t = ctor->pages[index]; + const unsigned tags = tagptr_unfold_tags(t); + + if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) + return tagptr_unfold_ptr(t); + } + + if (unlikely(nr >= ctor->nr)) + BUG(); + + return NULL; +} + +static inline void +z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, + bool atomic) +{ + struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); + + z_erofs_pagevec_ctor_exit(ctor, atomic); + + ctor->curr = next; + ctor->next = NULL; + ctor->pages = atomic ? + kmap_atomic(ctor->curr) : kmap(ctor->curr); + + ctor->nr = PAGE_SIZE / sizeof(struct page *); + ctor->index = 0; +} + +static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, + unsigned nr, + erofs_vtptr_t *pages, unsigned i) +{ + ctor->nr = nr; + ctor->curr = ctor->next = NULL; + ctor->pages = pages; + + if (i >= nr) { + i -= nr; + z_erofs_pagevec_ctor_pagedown(ctor, false); + while (i > ctor->nr) { + i -= ctor->nr; + z_erofs_pagevec_ctor_pagedown(ctor, false); + } + } + + ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); + ctor->index = i; +} + +static inline bool +z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor, + struct page *page, + enum z_erofs_page_type type, + bool *occupied) +{ + *occupied = false; + if (unlikely(ctor->next == NULL && type)) + if (ctor->index + 1 == ctor->nr) + return false; + + if (unlikely(ctor->index >= ctor->nr)) + z_erofs_pagevec_ctor_pagedown(ctor, false); + + /* exclusive page type must be 0 */ + if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) + __bad_page_type_exclusive(); + + /* should remind that collector->next never equal to 1, 2 */ + if (type == (uintptr_t)ctor->next) { + ctor->next = page; + *occupied = true; + } + + ctor->pages[ctor->index++] = + tagptr_fold(erofs_vtptr_t, page, type); + return true; +} + +static inline struct page * +z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor, + enum z_erofs_page_type *type) +{ + erofs_vtptr_t t; + + if (unlikely(ctor->index >= ctor->nr)) { + BUG_ON(ctor->next == NULL); + z_erofs_pagevec_ctor_pagedown(ctor, true); + } + + t = ctor->pages[ctor->index]; + + *type = tagptr_unfold_tags(t); + + /* should remind that collector->next never equal to 1, 2 */ + if (*type == (uintptr_t)ctor->next) + ctor->next = tagptr_unfold_ptr(t); + + ctor->pages[ctor->index++] = + tagptr_fold(erofs_vtptr_t, NULL, 0); + + return 
tagptr_unfold_ptr(t); +} + +#endif + diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c new file mode 100644 index 000000000000..ba45d7c79afa --- /dev/null +++ b/fs/erofs/unzip_vle.c @@ -0,0 +1,2035 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/unzip_vle.c + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#include "unzip_vle.h" +#include +#include + +#include + +#define PAGE_MIGRATE_LOCKED ((void *)0x5F10C10C) + +static struct workqueue_struct *z_erofs_workqueue __read_mostly; +static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly; + +void z_erofs_exit_zip_subsystem(void) +{ + BUG_ON(z_erofs_workqueue == NULL); + BUG_ON(z_erofs_workgroup_cachep == NULL); + + destroy_workqueue(z_erofs_workqueue); + kmem_cache_destroy(z_erofs_workgroup_cachep); +} + +static inline int init_unzip_workqueue(void) +{ + const unsigned onlinecpus = num_possible_cpus(); + + /* + * we don't need too many threads, limiting threads + * could improve scheduling performance. + */ + z_erofs_workqueue = alloc_workqueue("erofs_unzipd", + WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, + onlinecpus + onlinecpus / 4); + + return z_erofs_workqueue != NULL ? 0 : -ENOMEM; +} + +int z_erofs_init_zip_subsystem(void) +{ + z_erofs_workgroup_cachep = + kmem_cache_create("erofs_compress", + Z_EROFS_WORKGROUP_SIZE, 0, + SLAB_RECLAIM_ACCOUNT, NULL); + + if (z_erofs_workgroup_cachep != NULL) { + if (!init_unzip_workqueue()) + return 0; + + kmem_cache_destroy(z_erofs_workgroup_cachep); + } + return -ENOMEM; +} + +enum z_erofs_vle_work_role { + Z_EROFS_VLE_WORK_SECONDARY, + Z_EROFS_VLE_WORK_PRIMARY, + Z_EROFS_VLE_WORK_PRIMARY_TERMINAL, + + /* + * The current work has at least been linked with the following + * processed chained works, which means if the processing page + * is the tail partial page of the work, the current work can + * safely use the whole page, as illustrated below: + * +--------------+-------------------------------------------+ + * | tail page | head page (of the previous work) | + * +--------------+-------------------------------------------+ + * /\ which belongs to the current work + * [ (*) this page can be used for the current work itself. ] + */ + Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED, + Z_EROFS_VLE_WORK_MAX +}; + +struct z_erofs_vle_work_builder { + enum z_erofs_vle_work_role role; + /* + * 'hosted = false' means that the current workgroup doesn't belong to + * the owned chained workgroups. In the other words, it is none of our + * business to submit this workgroup. 
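+	 * A non-hosted builder therefore only queues its pages and drops the
+	 * work reference in z_erofs_vle_work_iter_end(); the bios are left to
+	 * the owner of the chain to submit.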
+ */ + bool hosted; + + struct z_erofs_vle_workgroup *grp; + struct z_erofs_vle_work *work; + struct z_erofs_pagevec_ctor vector; + + /* pages used for reading the compressed data */ + struct page **compressed_pages; + unsigned compressed_deficit; +}; + +#define VLE_WORK_BUILDER_INIT() \ + { .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED } + +#ifdef EROFS_FS_HAS_MANAGED_CACHE +static void z_erofs_vle_scan_cachepages(struct z_erofs_vle_work_builder *bl, + struct address_space *mapping, + pgoff_t index, + unsigned int clusterpages, + bool reserve_allocation, + struct list_head *pagepool) +{ + struct page **const compressed_pages = bl->compressed_pages; + const unsigned int compressed_deficit = bl->compressed_deficit; + bool standalone = true; + gfp_t gfp = mapping_gfp_constraint(mapping, ~__GFP_DIRECT_RECLAIM); + unsigned int i, j = 0; + + if (bl->role < Z_EROFS_VLE_WORK_PRIMARY_TERMINAL) + return; + + index += clusterpages - compressed_deficit; + + /* TODO: optimize by introducing find_get_pages_range */ + for (i = 0; i < compressed_deficit; ++i) { + struct page *page, *newpage = NULL; + z_erofs_ctptr_t v; + + if (READ_ONCE(compressed_pages[i]) != NULL) + continue; + + page = find_get_page(mapping, index + i); + if (page != NULL) + v = tagptr_fold(z_erofs_ctptr_t, page, 1); + else if (reserve_allocation) { +#if 1 + if (!list_empty(pagepool)) { + newpage = lru_to_page(pagepool); + list_del(&newpage->lru); + } else { + newpage = alloc_pages(gfp | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN, 0); + } + if (!newpage) + goto rrr; + newpage->mapping = Z_EROFS_MAPPING_PREALLOCATED; + v = tagptr_fold(z_erofs_ctptr_t, newpage, 1); +#else + v = tagptr_init(z_erofs_ctptr_t, + EROFS_UNALLOCATED_CACHED_PAGE); +#endif + } else { +rrr: + if (standalone) + j = i; + standalone = false; + continue; + } + + if (cmpxchg(&compressed_pages[i], + NULL, tagptr_cast_ptr(v)) == NULL) + continue; + + if (page != NULL) + put_page(page); + else if (newpage) { + newpage->mapping = NULL; + /* someone just allocated this page, drop our attempt */ + list_add(&newpage->lru, pagepool); + } + } + + bl->compressed_pages += j; + bl->compressed_deficit = compressed_deficit - j; + if (standalone) + bl->role = Z_EROFS_VLE_WORK_PRIMARY; +} + +/* called by erofs_shrinker to get rid of all compressed_pages */ +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp) +{ + struct z_erofs_vle_workgroup *const grp = + container_of(egrp, struct z_erofs_vle_workgroup, obj); + struct z_erofs_vle_work *const primary_work = + z_erofs_vle_grab_primary_work(grp); + struct address_space *const mapping = MNGD_MAPPING(sbi); + const int clusterpages = erofs_clusterpages(sbi); + int i; + + /* refcount of workgroup is now freezed as 1, check if it's in migration */ + if (!mutex_trylock(&primary_work->lock)) + return -EBUSY; + + /* + * refcount of workgroup is now freezed as 1, + * therefore no need to worry about available decompression users. 
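+	 * (any such user would have had to bump the refcount above 1 first).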
+ */ + for (i = 0; i < clusterpages; ++i) { + struct page *page = READ_ONCE(grp->compressed_pages[i]); + + if (page == NULL) + continue; + +#ifdef CONFIG_EROFS_FS_DEBUG + if (unlikely(page == PAGE_MIGRATE_LOCKED)) { + /* cannot be migrate locked */ + errln("%s: %d, mngd_mapping(%px) migrate_locked in grp %px", + __func__, __LINE__, mapping, grp); + + print_hex_dump(KERN_ERR, "grp data: ", DUMP_PREFIX_OFFSET, + 16, 1, grp, sizeof(struct z_erofs_vle_workgroup), true); + DBG_BUGON(1); + } +#endif + + /* block other users from reclaiming or migrating the page */ + if (!trylock_page(page)) { + mutex_unlock(&primary_work->lock); + return -EBUSY; + } + +#ifdef CONFIG_EROFS_FS_DEBUG + if (unlikely(page->mapping != mapping)) { + errln("%s: %d, page->mapping != mngd_mapping(%px) compressed_page %px in grp %px", + __func__, __LINE__, mapping, page, grp); + + print_hex_dump(KERN_ERR, "grp data: ", DUMP_PREFIX_OFFSET, + 16, 1, grp, sizeof(struct erofs_workgroup), true); + + print_hex_dump(KERN_ERR, "page data: ", DUMP_PREFIX_OFFSET, + 16, 1, page, sizeof(struct page), true); + + unlock_page(page); + continue; + } +#endif + + /* barrier is implied in the following 'unlock_page' */ + WRITE_ONCE(grp->compressed_pages[i], NULL); + + set_page_private(page, 0); + ClearPagePrivate(page); + + unlock_page(page); + put_page(page); + } + mutex_unlock(&primary_work->lock); + return 0; +} + +int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page) +{ + struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb); + const unsigned int clusterpages = erofs_clusterpages(sbi); + + struct z_erofs_vle_workgroup *grp; + int ret = 0; /* 0 - busy */ + + /* prevent the workgroup from being freed */ + rcu_read_lock(); + grp = (void *)page_private(page); + + if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) { + unsigned int i; + + for (i = 0; i < clusterpages; ++i) { + if (grp->compressed_pages[i] == page) { + WRITE_ONCE(grp->compressed_pages[i], NULL); + ret = 1; + break; + } + } + +#ifdef CONFIG_EROFS_FS_DEBUG + if (unlikely(!ret)) { + errln("%s: %d, cannot found compressed_page %px in grp %px", + __func__, __LINE__, page, grp); + + print_hex_dump(KERN_ERR, "grp data: ", DUMP_PREFIX_OFFSET, + 16, 1, grp, sizeof(struct erofs_workgroup), true); + + print_hex_dump(KERN_ERR, "page data: ", DUMP_PREFIX_OFFSET, + 16, 1, page, sizeof(struct page), true); + } +#endif + + erofs_workgroup_unfreeze(&grp->obj, 1); + } + rcu_read_unlock(); + + if (ret) { + ClearPagePrivate(page); + put_page(page); + } + return ret; +} +#ifdef CONFIG_MIGRATION +int erofs_migrate_cached_page(struct address_space *mapping, + struct page *newpage, + struct page *page, + enum migrate_mode mode) +{ + struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb); + const unsigned int clusterpages = erofs_clusterpages(sbi); + + struct z_erofs_vle_workgroup *grp; + struct z_erofs_vle_work *primary_work; + bool locking; + int rc; + unsigned int i; + + if (!PagePrivate(page)) + return migrate_page(mapping, newpage, page, mode); + + /* the workgroup will not be freed with compressed page locked */ + grp = (void *)READ_ONCE(page_private(page)); + DBG_BUGON(!grp); + + primary_work = z_erofs_vle_grab_primary_work(grp); + + if (!mutex_trylock(&primary_work->lock)) { + if (mode == MIGRATE_ASYNC) + return -EAGAIN; + + mutex_lock(&primary_work->lock); + } + + /* drop this migration attempt if freezed to 1 (reclaiming) */ + if (atomic_read(&grp->obj.refcount) == EROFS_LOCKED_MAGIC) { + mutex_unlock(&primary_work->lock); + return -EBUSY; + } 
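+	/*
+	 * Migration order: move the page cache mapping to newpage, swap the
+	 * slot in grp->compressed_pages[] from the old page to newpage, and
+	 * finally copy the page contents. If a reader has temporarily taken
+	 * the slot (PAGE_MIGRATE_LOCKED), the final put_page() of the old
+	 * page is skipped.
+	 */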
+ + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + if (rc != MIGRATEPAGE_SUCCESS) { + mutex_unlock(&primary_work->lock); + return rc; + } + + locking = false; + for (i = 0; i < clusterpages; ++i) { + const struct page *victim = + cmpxchg(&grp->compressed_pages[i], page, newpage); + + if (victim == page) { + get_page(newpage); + set_page_private(newpage, (unsigned long)grp); + __SetPagePrivate(newpage); + break; + } + if (victim == PAGE_MIGRATE_LOCKED) + locking = true; + } + + if (i >= clusterpages) + DBG_BUGON(!locking); + else + locking = false; + + ClearPagePrivate(page); + set_page_private(page, 0); + + migrate_page_copy(newpage, page); + mutex_unlock(&primary_work->lock); + + if (!locking) + put_page(page); + return MIGRATEPAGE_SUCCESS; +} +#endif +#endif + +/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ +static inline bool try_to_reuse_as_compressed_page( + struct z_erofs_vle_work_builder *b, + struct page *page) +{ + while (b->compressed_deficit) { + --b->compressed_deficit; + if (NULL == cmpxchg(b->compressed_pages++, NULL, page)) + return true; + } + + return false; +} + +/* callers must be with work->lock held */ +static int z_erofs_vle_work_add_page( + struct z_erofs_vle_work_builder *builder, + struct page *page, + enum z_erofs_page_type type) +{ + int ret; + bool occupied; + + /* give priority for the compressed data storage */ + if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY && + type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && + try_to_reuse_as_compressed_page(builder, page)) + return 0; + + ret = z_erofs_pagevec_ctor_enqueue(&builder->vector, + page, type, &occupied); + builder->work->vcnt += (unsigned)ret; + + return ret ? 0 : -EAGAIN; +} + +static enum z_erofs_vle_work_role try_to_claim_workgroup( + struct z_erofs_vle_workgroup *grp, + z_erofs_vle_owned_workgrp_t *owned_head, + bool *hosted) +{ + DBG_BUGON(*hosted == true); + + /* let's claim these following types of workgroup */ +retry: + if (grp->next == Z_EROFS_VLE_WORKGRP_NIL) { + /* type 1, nil workgroup */ + if (Z_EROFS_VLE_WORKGRP_NIL != cmpxchg(&grp->next, + Z_EROFS_VLE_WORKGRP_NIL, *owned_head)) + goto retry; + + *owned_head = grp; + *hosted = true; + + /* lucky, I am the followee :) */ + return Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED; + } else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) { + /* + * type 2, link to the end of a existing open chain, + * be careful that its submission itself is governed + * by the original owned chain. 
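+		 * Note that *hosted stays false here, so the current thread
+		 * must not submit this workgroup by itself either.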
+ */ + if (Z_EROFS_VLE_WORKGRP_TAIL != cmpxchg(&grp->next, + Z_EROFS_VLE_WORKGRP_TAIL, *owned_head)) + goto retry; + + *owned_head = Z_EROFS_VLE_WORKGRP_TAIL; + return Z_EROFS_VLE_WORK_PRIMARY_TERMINAL; + } + + /* :( better luck next time */ + return Z_EROFS_VLE_WORK_PRIMARY; +} + +struct z_erofs_vle_work_finder { + struct super_block *sb; + pgoff_t idx; + unsigned pageofs; + + struct z_erofs_vle_workgroup **grp_ret; + enum z_erofs_vle_work_role *role; + z_erofs_vle_owned_workgrp_t *owned_head; + bool *hosted; +}; + +static struct z_erofs_vle_work * +z_erofs_vle_work_lookup(const struct z_erofs_vle_work_finder *f) +{ + bool tag, primary; + struct erofs_workgroup *egrp; + struct z_erofs_vle_workgroup *grp; + struct z_erofs_vle_work *work; + + egrp = erofs_find_workgroup(f->sb, f->idx, &tag); + if (egrp == NULL) { + *f->grp_ret = NULL; + return NULL; + } + + grp = container_of(egrp, struct z_erofs_vle_workgroup, obj); + *f->grp_ret = grp; + + work = z_erofs_vle_grab_work(grp, f->pageofs); + /* if multiref is disabled, `primary' is always true */ + primary = true; + + DBG_BUGON(work->pageofs != f->pageofs); + + /* + * lock must be taken first to avoid grp->next == NIL between + * claiming workgroup and adding pages: + * grp->next != NIL + * grp->next = NIL + * mutex_unlock_all + * mutex_lock(&work->lock) + * add all pages to pagevec + * + * [correct locking case 1]: + * mutex_lock(grp->work[a]) + * ... + * mutex_lock(grp->work[b]) mutex_lock(grp->work[c]) + * ... *role = SECONDARY + * add all pages to pagevec + * ... + * mutex_unlock(grp->work[c]) + * mutex_lock(grp->work[c]) + * ... + * grp->next = NIL + * mutex_unlock_all + * + * [correct locking case 2]: + * mutex_lock(grp->work[b]) + * ... + * mutex_lock(grp->work[a]) + * ... + * mutex_lock(grp->work[c]) + * ... + * grp->next = NIL + * mutex_unlock_all + * mutex_lock(grp->work[a]) + * *role = PRIMARY_OWNER + * add all pages to pagevec + * ... + */ + mutex_lock(&work->lock); + + *f->hosted = false; + *f->role = !primary ? Z_EROFS_VLE_WORK_SECONDARY : + /* claim the workgroup if possible */ + try_to_claim_workgroup(grp,f->owned_head, f->hosted); + return work; +} + +static struct z_erofs_vle_work * +z_erofs_vle_work_register(const struct z_erofs_vle_work_finder *f, + struct erofs_map_blocks *map) +{ + bool gnew = false; + struct z_erofs_vle_workgroup *grp = *f->grp_ret; + struct z_erofs_vle_work *work; + + /* if multiref is disabled, grp should never be nullptr */ + BUG_ON(grp != NULL); + + /* no available workgroup, let's allocate one */ + grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS); + if (unlikely(grp == NULL)) + return ERR_PTR(-ENOMEM); + + grp->obj.index = f->idx; + grp->llen = map->m_llen; + + z_erofs_vle_set_workgrp_fmt(grp, + (map->m_flags & EROFS_MAP_ZIPPED) ? 
+ Z_EROFS_VLE_WORKGRP_FMT_LZ4 : + Z_EROFS_VLE_WORKGRP_FMT_PLAIN); + atomic_set(&grp->obj.refcount, 1); + + /* new workgrps have been claimed as type 1 */ + WRITE_ONCE(grp->next, *f->owned_head); + /* primary and followed work for all new workgrps */ + *f->role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED; + /* it should be submitted by ourselves */ + *f->hosted = true; + + gnew = true; + work = z_erofs_vle_grab_primary_work(grp); + work->pageofs = f->pageofs; + + mutex_init(&work->lock); + + /* lock all primary followed works before visible to others */ + if (unlikely(!mutex_trylock(&work->lock))) + BUG(); + + if (gnew) { + int err = erofs_register_workgroup(f->sb, &grp->obj, 0); + + if (err) { + mutex_unlock(&work->lock); + kmem_cache_free(z_erofs_workgroup_cachep, grp); + return ERR_PTR(-EAGAIN); + } + } + + *f->owned_head = *f->grp_ret = grp; + return work; +} + +#define builder_is_weak_followed(builder) \ + ((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_TERMINAL) + +#define builder_is_followed(builder) \ + ((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED) + +static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_builder *builder, + struct super_block *sb, + struct erofs_map_blocks *map, + z_erofs_vle_owned_workgrp_t *owned_head) +{ + const unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb)); + struct z_erofs_vle_workgroup *grp; + const struct z_erofs_vle_work_finder finder = { + .sb = sb, + .idx = erofs_blknr(map->m_pa), + .pageofs = map->m_la & ~PAGE_MASK, + .grp_ret = &grp, + .role = &builder->role, + .owned_head = owned_head, + .hosted = &builder->hosted + }; + struct z_erofs_vle_work *work; + + DBG_BUGON(builder->work != NULL); + + /* must be Z_EROFS_WORK_TAIL or the next chained work */ + DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_NIL); + DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); + + DBG_BUGON(erofs_blkoff(map->m_pa)); + +repeat: + work = z_erofs_vle_work_lookup(&finder); + if (work != NULL) { + unsigned int orig_llen; + + /* increase workgroup `llen' if needed */ + while ((orig_llen = READ_ONCE(grp->llen)) < map->m_llen && + orig_llen != cmpxchg_relaxed(&grp->llen, + orig_llen, map->m_llen)) + cpu_relax(); + goto got_it; + } + + work = z_erofs_vle_work_register(&finder, map); + if (unlikely(work == ERR_PTR(-EAGAIN))) + goto repeat; + + if (unlikely(IS_ERR(work))) + return PTR_ERR(work); +got_it: + z_erofs_pagevec_ctor_init(&builder->vector, + Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt); + + if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY) { + /* enable possibly in-place decompression */ + builder->compressed_pages = grp->compressed_pages; + builder->compressed_deficit = clusterpages; + } else { + builder->compressed_pages = NULL; + builder->compressed_deficit = 0; + } + + builder->grp = grp; + builder->work = work; + return 0; +} + +/* + * keep in mind that no referenced workgroups will be freed + * only after a RCU grace period, so rcu_read_lock() could + * prevent a workgroup from being freed. 
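+ * erofs_workgroup_free_rcu() below therefore defers the actual
+ * kmem_cache_free() with call_rcu().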
+ */ +static void z_erofs_rcu_callback(struct rcu_head *head) +{ + struct z_erofs_vle_work *work = container_of(head, + struct z_erofs_vle_work, rcu); + struct z_erofs_vle_workgroup *grp = + z_erofs_vle_work_workgroup(work, true); + + kmem_cache_free(z_erofs_workgroup_cachep, grp); +} + +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) +{ + struct z_erofs_vle_workgroup *const vgrp = container_of(grp, + struct z_erofs_vle_workgroup, obj); + struct z_erofs_vle_work *const work = &vgrp->work; + + call_rcu(&work->rcu, z_erofs_rcu_callback); +} + +static void __z_erofs_vle_work_release(struct z_erofs_vle_workgroup *grp, + struct z_erofs_vle_work *work __maybe_unused) +{ + erofs_workgroup_put(&grp->obj); +} + +void z_erofs_vle_work_release(struct z_erofs_vle_work *work) +{ + struct z_erofs_vle_workgroup *grp = + z_erofs_vle_work_workgroup(work, true); + + __z_erofs_vle_work_release(grp, work); +} + +static inline bool +z_erofs_vle_work_iter_end(struct z_erofs_vle_work_builder *builder) +{ + struct z_erofs_vle_work *work = builder->work; + + if (work == NULL) + return false; + + z_erofs_pagevec_ctor_exit(&builder->vector, false); + mutex_unlock(&work->lock); + + /* + * if all pending pages are added, don't hold work reference + * any longer if the current work isn't hosted by ourselves. + */ + if (!builder->hosted) + __z_erofs_vle_work_release(builder->grp, work); + + builder->work = NULL; + builder->grp = NULL; + return true; +} + +static inline struct page *__stagingpage_alloc(struct list_head *pagepool, + gfp_t gfp) +{ + struct page *page = erofs_allocpage(pagepool, gfp); + + if (unlikely(page == NULL)) + return NULL; + + page->mapping = Z_EROFS_MAPPING_STAGING; + return page; +} + +struct z_erofs_vle_frontend { + struct inode *const inode; + + struct z_erofs_vle_work_builder builder; + struct erofs_map_blocks_iter m_iter; + + z_erofs_vle_owned_workgrp_t owned_head; + + bool initial; +#if (EROFS_FS_ZIP_CACHE_LVL >= 2) + erofs_off_t cachedzone_la; +#endif +}; + +#define VLE_FRONTEND_INIT(__i) { \ + .inode = __i, \ + .m_iter = { \ + { .m_llen = 0, .m_plen = 0 }, \ + .mpage = NULL \ + }, \ + .builder = VLE_WORK_BUILDER_INIT(), \ + .owned_head = Z_EROFS_VLE_WORKGRP_TAIL, \ + .initial = true, } + +static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe, + struct page *page, + struct list_head *page_pool) +{ + struct super_block *const sb = fe->inode->i_sb; + struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb); + struct erofs_map_blocks_iter *const m = &fe->m_iter; + struct erofs_map_blocks *const map = &m->map; + struct z_erofs_vle_work_builder *const builder = &fe->builder; + const loff_t offset = page_offset(page); + + bool tight = builder_is_weak_followed(builder); + struct z_erofs_vle_work *work = builder->work; + + enum z_erofs_page_type page_type; + unsigned cur, end, spiltted, index; + int err = 0; + + /* register locked file pages as online pages in pack */ + z_erofs_onlinepage_init(page); + + spiltted = 0; + end = PAGE_SIZE; +repeat: + cur = end - 1; + + /* lucky, within the range of the current map_blocks */ + if (offset + cur >= map->m_la && + offset + cur < map->m_la + map->m_llen) { + /* the work haven't exist (maybe due to allocation failure) */ + if (unlikely(!builder->work)) + goto rebegin_work; + goto hitted; + } + + /* go ahead the next map_blocks */ + debugln("%s: [out-of-range] pos %llu", __func__, offset + cur); + + if (z_erofs_vle_work_iter_end(builder)) + fe->initial = false; + + map->m_la = offset + cur; + map->m_llen = 0; + err = 
erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0); + if (unlikely(err)) + goto err_out; + +rebegin_work: + if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) + goto hitted; + + DBG_BUGON(map->m_plen != 1U << sbi->clusterbits); + DBG_BUGON(erofs_blkoff(map->m_pa)); + + err = z_erofs_vle_work_iter_begin(builder, sb, map, &fe->owned_head); + if (unlikely(err)) + goto err_out; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + z_erofs_vle_scan_cachepages(builder, MNGD_MAPPING(sbi), + erofs_blknr(map->m_pa), + erofs_blknr(map->m_plen), + /* compressed page caching selection strategy */ + fe->initial | (EROFS_FS_ZIP_CACHE_LVL >= 2 ? + map->m_la < fe->cachedzone_la : 0), page_pool); +#endif + + tight &= builder_is_weak_followed(builder); + work = builder->work; +hitted: + cur = end - min_t(unsigned, offset + end - map->m_la, end); + if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) { + zero_user_segment(page, cur, end); + goto next_part; + } + + /* let's derive page type */ + page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : + (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : + (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); + + if (cur) + tight &= builder_is_followed(builder); +retry: + err = z_erofs_vle_work_add_page(builder, page, page_type); + /* should allocate an additional staging page for pagevec */ + if (err == -EAGAIN) { + struct page *const newpage = + __stagingpage_alloc(page_pool, GFP_NOFS); + + err = z_erofs_vle_work_add_page(builder, + newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE); + if (likely(!err)) + goto retry; + } + + if (unlikely(err)) + goto err_out; + + index = page->index - map->m_la / PAGE_SIZE; + + /* FIXME! avoid the last relundant fixup & endio */ + z_erofs_onlinepage_fixup(page, index, true); + + /* bump up the number of spiltted parts of a page */ + ++spiltted; + + if (unlikely(spiltted > 2)) { + errln("%s, bad spiltted on page %px nid %llu index %lu", + __func__, page, EROFS_V(fe->inode)->nid, page->index); + BUG(); + } + + /* also update nr_pages */ + work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1); +next_part: + /* can be used for verification */ + map->m_llen = offset + cur - map->m_la; + + end = cur; + if (end > 0) + goto repeat; + +out: + /* FIXME! 
avoid the last relundant fixup & endio */ + z_erofs_onlinepage_endio(page); + + debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu", + __func__, page, spiltted, map->m_llen); + return err; + + /* if some error occurred while processing this page */ +err_out: + SetPageError(page); + goto out; +} + +static void z_erofs_vle_unzip_wq(struct work_struct *work); + +static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) +{ + tagptr1_t t = tagptr_init(tagptr1_t, ptr); + struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t); + bool background = tagptr_unfold_tags(t); + + if (!background) { + unsigned long flags; + + spin_lock_irqsave(&io->u.wait.lock, flags); + if (!atomic_add_return(bios, &io->pending_bios)) + wake_up_locked(&io->u.wait); + spin_unlock_irqrestore(&io->u.wait.lock, flags); + return; + } + + if (!atomic_add_return(bios, &io->pending_bios)) { +#ifdef CONFIG_PREEMPT_COUNT + if (in_atomic() || irqs_disabled()) + queue_work(z_erofs_workqueue, &io->u.work); + else + z_erofs_vle_unzip_wq(&io->u.work); +#else + queue_work(z_erofs_workqueue, &io->u.work); +#endif + } +} + + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)) +static inline void z_erofs_vle_read_endio(struct bio *bio, int err) +#else +static inline void z_erofs_vle_read_endio(struct bio *bio) +#endif +{ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)) + const int err = bio->bi_status; +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)) + const int err = bio->bi_error; +#endif + unsigned i; + struct bio_vec *bvec; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + struct address_space *mngda = NULL; +#endif + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + bool cachemngd = false; + + DBG_BUGON(PageUptodate(page)); + BUG_ON(page->mapping == NULL); + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (unlikely(mngda == NULL && !z_erofs_is_stagingpage(page))) { + struct inode *const inode = page->mapping->host; + + mngda = MNGD_MAPPING(EROFS_I_SB(inode)); + } + + /* + * If mngda has not gotten, it equals NULL, + * however, page->mapping never be NULL if working properly. 
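+		 * In other words, cachemngd below is only true for pages
+		 * owned by the managed cache mapping.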
+ */ + cachemngd = (page->mapping == mngda); +#endif + + if (unlikely(err)) + SetPageError(page); + else if (cachemngd) + SetPageUptodate(page); + + if (cachemngd) + unlock_page(page); + } + + z_erofs_vle_unzip_kickoff(bio->bi_private, -1); + bio_put(bio); +} + +static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES]; +static DEFINE_MUTEX(z_pagemap_global_lock); + +static int z_erofs_vle_unzip(struct super_block *sb, + struct z_erofs_vle_workgroup *grp, + struct list_head *page_pool) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); +#ifdef EROFS_FS_HAS_MANAGED_CACHE + struct address_space *const mngda = MNGD_MAPPING(sbi); +#endif + const unsigned clusterpages = erofs_clusterpages(sbi); + + struct z_erofs_pagevec_ctor ctor; + unsigned nr_pages; + unsigned sparsemem_pages = 0; + struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES]; + struct page **pages, **compressed_pages, *page; + unsigned i, llen; + + enum z_erofs_page_type page_type; + bool overlapped; + struct z_erofs_vle_work *work; + void *vout; + int err; + + might_sleep(); + work = z_erofs_vle_grab_primary_work(grp); + BUG_ON(!READ_ONCE(work->nr_pages)); + + mutex_lock(&work->lock); + nr_pages = work->nr_pages; + + if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES)) + pages = pages_onstack; + else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES && + mutex_trylock(&z_pagemap_global_lock)) + pages = z_pagemap_global; + else { +repeat: + pages = kvmalloc_array(nr_pages, + sizeof(struct page *), GFP_KERNEL); + + /* fallback to global pagemap for the lowmem scenario */ + if (unlikely(pages == NULL)) { + if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES) + goto repeat; + else { + mutex_lock(&z_pagemap_global_lock); + pages = z_pagemap_global; + } + } + } + + for (i = 0; i < nr_pages; ++i) + pages[i] = NULL; + + z_erofs_pagevec_ctor_init(&ctor, + Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0); + + for (i = 0; i < work->vcnt; ++i) { + unsigned pagenr; + + page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type); + + /* all pages in pagevec ought to be valid */ + DBG_BUGON(page == NULL); + DBG_BUGON(page->mapping == NULL); + + if (z_erofs_gather_if_stagingpage(page_pool, page)) + continue; + + if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) + pagenr = 0; + else + pagenr = z_erofs_onlinepage_index(page); + + BUG_ON(pagenr >= nr_pages); + BUG_ON(pages[pagenr] != NULL); + + pages[pagenr] = page; + } + sparsemem_pages = i; + + z_erofs_pagevec_ctor_exit(&ctor, true); + + overlapped = false; + compressed_pages = grp->compressed_pages; + + err = 0; + for (i = 0; i < clusterpages; ++i) { + unsigned pagenr; + + page = compressed_pages[i]; + + /* all compressed pages ought to be valid */ + DBG_BUGON(page == NULL); + DBG_BUGON(page->mapping == NULL); + + if (z_erofs_is_stagingpage(page)) + continue; +#ifdef EROFS_FS_HAS_MANAGED_CACHE + else if (page->mapping == mngda) { + if (unlikely(!PageUptodate(page))) { + /* PageError should be set in z_erofs_vle_read_endio */ + DBG_BUGON(!PageError(page)); + err = -EIO; + } + continue; + } +#endif + + /* only non-head page could be reused as a compressed page */ + pagenr = z_erofs_onlinepage_index2(page); + + BUG_ON(pagenr >= nr_pages); + BUG_ON(pages[pagenr] != NULL); + ++sparsemem_pages; + pages[pagenr] = page; + + overlapped = true; + } + + if (err) + goto out; + + llen = (nr_pages << PAGE_SHIFT) - work->pageofs; + + if (z_erofs_vle_workgrp_fmt(grp) == Z_EROFS_VLE_WORKGRP_FMT_PLAIN) { + /* FIXME! 
this should be fixed in the future */ + BUG_ON(grp->llen != llen); + + err = z_erofs_vle_plain_copy(compressed_pages, clusterpages, + pages, nr_pages, work->pageofs); + goto out; + } + + if (llen > grp->llen) + llen = grp->llen; + + err = z_erofs_vle_unzip_fast_percpu(compressed_pages, + clusterpages, pages, llen, work->pageofs, + test_opt(sbi, LZ4ASM)); + if (err != -ENOTSUPP) + goto out; + + if (sparsemem_pages >= nr_pages) { + BUG_ON(sparsemem_pages > nr_pages); + goto skip_allocpage; + } + + for (i = 0; i < nr_pages; ++i) { + if (pages[i] != NULL) + continue; + + pages[i] = __stagingpage_alloc(page_pool, GFP_NOFS +//#if defined(CONFIG_CMA) && defined(___GFP_CMA) +// | ___GFP_CMA +//#endif + ); + } + +skip_allocpage: + vout = erofs_vmap(pages, nr_pages); + if (!vout) { + err = -ENOMEM; + goto out; + } + + err = z_erofs_vle_unzip_vmap(compressed_pages, + clusterpages, vout, llen, + work->pageofs, overlapped, test_opt(sbi, LZ4ASM)); + + erofs_vunmap(vout, nr_pages); + +out: + /* must handle all compressed pages before endding pages */ + for (i = 0; i < clusterpages; ++i) { + page = compressed_pages[i]; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (page->mapping == mngda) + continue; +#endif + /* recycle all individual staging pages */ + (void)z_erofs_gather_if_stagingpage(page_pool, page); + + WRITE_ONCE(compressed_pages[i], NULL); + } + + for (i = 0; i < nr_pages; ++i) { + page = pages[i]; + + if (!page) + continue; + + DBG_BUGON(page->mapping == NULL); + + /* recycle all individual staging pages */ + if (z_erofs_gather_if_stagingpage(page_pool, page)) + continue; + + if (unlikely(err < 0)) + SetPageError(page); + + z_erofs_onlinepage_endio(page); + } + + if (pages == z_pagemap_global) + mutex_unlock(&z_pagemap_global_lock); + else if (unlikely(pages != pages_onstack)) + kvfree(pages); + + work->nr_pages = 0; + work->vcnt = 0; + + /* all work locks MUST be taken before the following line */ + + WRITE_ONCE(grp->next, Z_EROFS_VLE_WORKGRP_NIL); + + /* all work locks SHOULD be released right now */ + mutex_unlock(&work->lock); + + z_erofs_vle_work_release(work); + return err; +} + +static void z_erofs_vle_unzip_all(struct super_block *sb, + struct z_erofs_vle_unzip_io *io, + struct list_head *page_pool) +{ + z_erofs_vle_owned_workgrp_t owned = io->head; + + while (owned != Z_EROFS_VLE_WORKGRP_TAIL_CLOSED) { + struct z_erofs_vle_workgroup *grp; + + /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_TAIL); + + /* no possible that 'owned' equals NULL */ + DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_NIL); + + grp = owned; + owned = READ_ONCE(grp->next); + + z_erofs_vle_unzip(sb, grp, page_pool); + } +} + +static void z_erofs_vle_unzip_wq(struct work_struct *work) +{ + struct z_erofs_vle_unzip_io_sb *iosb = container_of(work, + struct z_erofs_vle_unzip_io_sb, io.u.work); + LIST_HEAD(page_pool); + + BUG_ON(iosb->io.head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); + z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool); + + put_pages_list(&page_pool); + kvfree(iosb); +} + +static inline struct z_erofs_vle_unzip_io * +prepare_io_handler(struct super_block *sb, + struct z_erofs_vle_unzip_io *io, + bool background) +{ + struct z_erofs_vle_unzip_io_sb *iosb; + + if (!background) { + /* waitqueue available for foreground io */ + BUG_ON(io == NULL); + + init_waitqueue_head(&io->u.wait); + atomic_set(&io->pending_bios, 0); + goto out; + } + + if (io != NULL) + BUG(); + else { + /* allocate extra io descriptor for background io */ + iosb = kvzalloc(sizeof(struct 
z_erofs_vle_unzip_io_sb), + GFP_KERNEL | __GFP_NOFAIL); + BUG_ON(iosb == NULL); + + io = &iosb->io; + } + + iosb->sb = sb; + INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq); +out: + io->head = Z_EROFS_VLE_WORKGRP_TAIL_CLOSED; + return io; +} + +static struct page * +z_erofs_workgrp_grab_page_for_submission(struct z_erofs_vle_workgroup *grp, + pgoff_t first_index, + unsigned int nr, + struct list_head *pagepool, + gfp_t gfp, + struct address_space *mc) +{ + struct address_space *mapping; + struct page *oldpage, *page; + bool tocache = false; + z_erofs_ctptr_t t; + int justfound; + +repeat: + page = xchg(&grp->compressed_pages[nr], PAGE_MIGRATE_LOCKED); + oldpage = PAGE_MIGRATE_LOCKED; + + if (page == NULL) + goto out_allocpage; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (page == EROFS_UNALLOCATED_CACHED_PAGE) { + tocache = true; + goto out_allocpage; + } +#endif + + /* parse the compressed tagged pointer */ + t = tagptr_init(z_erofs_ctptr_t, page); + justfound = tagptr_unfold_tags(t); + page = tagptr_unfold_ptr(t); + + mapping = READ_ONCE(page->mapping); + +#ifndef EROFS_FS_HAS_MANAGED_CACHE + /* if managed cache is disabled, it is impossible `justfound' */ + DBG_BUGON(justfound); + + /* and it should be locked, not uptodate, and not truncated */ + DBG_BUGON(!PageLocked(page)); + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(mapping == NULL); + + goto out; +#else + if (mapping == Z_EROFS_MAPPING_PREALLOCATED) { + WRITE_ONCE(grp->compressed_pages[nr], page); + goto out_add_to_managed_cache; + } + + + /* all unmanaged pages are locked, so it's impossible to be NULL */ + if (mapping != NULL && mapping != mc) { + WRITE_ONCE(grp->compressed_pages[nr], page); + /* ought to be unmanaged pages */ + goto out; + } + + lock_page(page); +#ifdef CONFIG_EROFS_FS_DEBUG + /* page reclaim went wrong, should never happen */ + if (unlikely(justfound && PagePrivate(page))) { + struct erofs_workgroup *ogrp; + + errln("%s: %d: page %px refcount %d grp %px (index %lu count %d) " + "page_private %lx", + __func__, __LINE__, page, page_count(page), grp, grp->obj.index, + atomic_read(&grp->obj.refcount), page_private(page)); + + print_hex_dump(KERN_ERR, "grp data: ", DUMP_PREFIX_OFFSET, + 16, 1, grp, sizeof(struct erofs_workgroup), true); + + rcu_read_lock(); + ogrp = (void *)page_private(page); + errln("%s: %d: page %px page_private %px", __func__, __LINE__, page, ogrp); + + print_hex_dump(KERN_ERR, "ogrp data: ", DUMP_PREFIX_OFFSET, + 16, 1, ogrp, sizeof(struct erofs_workgroup), true); + rcu_read_unlock(); + BUG(); + } +#endif + + if (page->mapping == mc) { + WRITE_ONCE(grp->compressed_pages[nr], page); + + if (!PagePrivate(page)) { + if (!justfound) + get_page(page); + justfound = 0; + set_page_private(page, (unsigned long)grp); + SetPagePrivate(page); + } + + if (PageUptodate(page)) { + unlock_page(page); + page = NULL; + } + goto out; + } + + /* for the truncation case (page locked) */ + DBG_BUGON(page->mapping != NULL); + DBG_BUGON(PagePrivate(page)); + + tocache = true; +#ifdef CONFIG_EROFS_FS_DEBUG + errln("%s: %d truncated page %px (count %d) grp %px (count %d)", + __func__, __LINE__, page, page_count(page), grp, atomic_read(&grp->obj.refcount)); +#endif + unlock_page(page); + put_page(page); + /* fallthrough */ +#endif +out_allocpage: + if (tocache) + gfp |= __GFP_MOVABLE +#if defined(CONFIG_CMA) && defined(___GFP_CMA) + | ___GFP_CMA +#endif + ; + + page = __stagingpage_alloc(pagepool, gfp); + if (oldpage != cmpxchg(&grp->compressed_pages[nr], oldpage, page)) { + list_add(&page->lru, pagepool); + cpu_relax(); + 
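+		/* someone else changed this slot first, retry from the beginning */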
goto repeat; + } +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (!tocache) + goto out; +out_add_to_managed_cache: + if (add_to_page_cache_lru(page, mc, first_index + nr, gfp)) { +#ifdef CONFIG_EROFS_FS_DEBUG + errln("%s: %d add_to_page_cache_lru failed page %px (count %d) grp %px (count %d)", + __func__, __LINE__, page, page_count(page), grp, atomic_read(&grp->obj.refcount)); +#endif + page->mapping = Z_EROFS_MAPPING_STAGING; + goto out; + } + + set_page_private(page, (unsigned long)grp); + SetPagePrivate(page); +#endif +out: /* the only exit (for tracing and debugging) */ + return page; +} + +#ifdef EROFS_FS_HAS_MANAGED_CACHE +#define __FSIO_1 1 +#else +#define __FSIO_1 0 +#endif + +static bool z_erofs_vle_submit_all(struct super_block *sb, + z_erofs_vle_owned_workgrp_t owned_head, + struct list_head *pagepool, + struct z_erofs_vle_unzip_io *fg_io, + bool force_fg, + unsigned *io_submitted) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + const unsigned clusterpages = erofs_clusterpages(sbi); + const gfp_t gfp = GFP_NOFS; +#ifdef EROFS_FS_HAS_MANAGED_CACHE + struct address_space *const mc = MNGD_MAPPING(sbi); + struct z_erofs_vle_workgroup *lstgrp_noio = NULL, *lstgrp_io = NULL; +#endif + struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1]; + struct bio *bio; + tagptr1_t bi_private; + /* since bio will be NULL, no need to initialize last_index */ + pgoff_t uninitialized_var(last_index); + bool force_submit = false; + unsigned nr_bios; + + if (unlikely(owned_head == Z_EROFS_VLE_WORKGRP_TAIL)) + return false; + + /* + * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io + * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io + */ +#ifdef EROFS_FS_HAS_MANAGED_CACHE + ios[0] = prepare_io_handler(sb, fg_io + 0, false); +#endif + + if (force_fg) { + ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false); + bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0); + } else { + ios[__FSIO_1] = prepare_io_handler(sb, NULL, true); + bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 1); + } + + nr_bios = 0; + force_submit = false; + bio = NULL; + + /* by default, all need io submission */ + ios[__FSIO_1]->head = owned_head; + + do { + struct z_erofs_vle_workgroup *grp; + pgoff_t first_index; + struct page *page; + unsigned i = 0; + unsigned int noio = 0; + int err; + + /* no possible 'owned_head' equals the following */ + DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); + DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_NIL); + + grp = owned_head; + + /* close the main owned chain at first */ + owned_head = cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL, + Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); + + first_index = grp->obj.index; + force_submit |= (first_index != last_index + 1); + + /* fulfill all compressed pages */ +repeat: + page = z_erofs_workgrp_grab_page_for_submission(grp, + first_index, i, pagepool, gfp, mc); + + if (page == NULL) { + force_submit = true; + ++noio; + goto skippage; + } + + if (bio != NULL && force_submit) { +submit_bio_retry: + __submit_bio(bio, REQ_OP_READ, 0); + bio = NULL; + } + + if (bio == NULL) { + bio = erofs_grab_bio(sb, first_index + i, + BIO_MAX_PAGES, z_erofs_vle_read_endio, true); + bio->bi_private = tagptr_cast_ptr(bi_private); + + ++nr_bios; + } + + err = bio_add_page(bio, page, PAGE_SIZE, 0); + if (err < PAGE_SIZE) + goto submit_bio_retry; + + force_submit = false; + last_index = first_index + i; +skippage: + if (++i < clusterpages) + goto repeat; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (noio < clusterpages) { + lstgrp_io = grp; + 
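+			/* at least one compressed page of this workgroup needs real I/O */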
} else { + z_erofs_vle_owned_workgrp_t iogrp_next = + owned_head == Z_EROFS_VLE_WORKGRP_TAIL ? + Z_EROFS_VLE_WORKGRP_TAIL_CLOSED : + owned_head; + + if (lstgrp_io == NULL) + ios[1]->head = iogrp_next; + else + WRITE_ONCE(lstgrp_io->next, iogrp_next); + + if (lstgrp_noio == NULL) + ios[0]->head = grp; + else + WRITE_ONCE(lstgrp_noio->next, grp); + + lstgrp_noio = grp; + } +#endif + } while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL); + + if (bio != NULL) + __submit_bio(bio, REQ_OP_READ, 0); + + if (io_submitted) + *io_submitted = nr_bios; + +#ifndef EROFS_FS_HAS_MANAGED_CACHE + BUG_ON(!nr_bios); +#else + if (lstgrp_noio != NULL) + WRITE_ONCE(lstgrp_noio->next, Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); + + if (!force_fg && !nr_bios) { + kvfree(container_of(ios[1], + struct z_erofs_vle_unzip_io_sb, io)); + return true; + } +#endif + + z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios); + return true; +} + +static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f, + struct list_head *pagepool, + bool force_fg, + unsigned *io_submitted) +{ + struct super_block *sb = f->inode->i_sb; + struct z_erofs_vle_unzip_io io[1 + __FSIO_1]; + + if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg, io_submitted)) + return; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + z_erofs_vle_unzip_all(sb, &io[0], pagepool); +#endif + if (!force_fg) + return; + + /* wait until all bios are completed */ + io_wait_event(io[__FSIO_1].u.wait, + !atomic_read(&io[__FSIO_1].pending_bios)); + + /* let's synchronous decompression */ + z_erofs_vle_unzip_all(sb, &io[__FSIO_1], pagepool); +} + +static int z_erofs_vle_normalaccess_readpage(struct file *file, + struct page *page) +{ + struct inode *const inode = page->mapping->host; + struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode); + int err; + LIST_HEAD(pagepool); + + trace_erofs_readpage(page, false); + +#if (EROFS_FS_ZIP_CACHE_LVL >= 2) + f.cachedzone_la = page->index << PAGE_SHIFT; +#endif + err = z_erofs_do_read_page(&f, page, &pagepool); + (void)z_erofs_vle_work_iter_end(&f.builder); + + if (err) { + errln("%s, failed to read, err [%d]", __func__, err); + goto out; + } + + z_erofs_submit_and_unzip(&f, &pagepool, true, NULL); +out: + if (f.m_iter.mpage != NULL) + put_page(f.m_iter.mpage); + + /* clean up the remaining free pages */ + put_pages_list(&pagepool); + return 0; +} + +static int z_erofs_vle_normalaccess_readpages(struct file *filp, + struct address_space *mapping, + struct list_head *pages, + unsigned int nr_pages) +{ + struct inode *const inode = mapping->host; + struct block_device *bdev = inode->i_sb->s_bdev; + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + + bool sync = __should_decompress_synchronously(sbi, nr_pages); + struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + struct page *head = NULL; + LIST_HEAD(pagepool); + unsigned io_submitted = 0; + + trace_erofs_readpages(mapping->host, lru_to_page(pages), nr_pages, false); + + if (pages) { + /* + * Get one quota before read pages, when this ends, + * get the rest of quotas according to how many bios + * we submited in this routine. 
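+		 * The remaining quotas (one per extra bio) are taken at the
+		 * tail of this function once the final bio count is known.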
+ */ + blk_throtl_get_quota(bdev, PAGE_SIZE, + msecs_to_jiffies(100), true); + } + +#if (EROFS_FS_ZIP_CACHE_LVL >= 2) + f.cachedzone_la = lru_to_page(pages)->index << PAGE_SHIFT; +#endif + for (; nr_pages; --nr_pages) { + struct page *page = lru_to_page(pages); + + prefetchw(&page->flags); + list_del(&page->lru); + + /* + * A pure asynchronous readahead is indicated if + * a PG_readahead marked page is hitted at first. + * Let's also do asynchronous decompression for this case. + */ + sync &= !(PageReadahead(page) && !head); /*lint !e514*/ + + if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { + list_add(&page->lru, &pagepool); + continue; + } + + BUG_ON(PagePrivate(page)); + set_page_private(page, (unsigned long)head); + head = page; + } + + while (head != NULL) { + struct page *page = head; + int err; + + /* traversal in reverse order */ + head = (void *)page_private(page); + + err = z_erofs_do_read_page(&f, page, &pagepool); + if (err) { + struct erofs_vnode *vi = EROFS_V(inode); + + errln("%s, readahead error at page %lu of nid %llu", + __func__, page->index, vi->nid); + } + + put_page(page); + } + + (void)z_erofs_vle_work_iter_end(&f.builder); + + z_erofs_submit_and_unzip(&f, &pagepool, sync, &io_submitted); + + if (f.m_iter.mpage != NULL) + put_page(f.m_iter.mpage); + + /* clean up the remaining free pages */ + put_pages_list(&pagepool); + + if (io_submitted) + while (--io_submitted) + blk_throtl_get_quota(bdev, PAGE_SIZE, + msecs_to_jiffies(100), true); + return 0; +} + +const struct address_space_operations z_erofs_vle_normalaccess_aops = { + .readpage = z_erofs_vle_normalaccess_readpage, + .readpages = z_erofs_vle_normalaccess_readpages, +}; + +/* + * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode + * --- + * VLE compression mode attempts to compress a number of logical data into + * a physical cluster with a fixed size. + * VLE compression mode uses "struct z_erofs_vle_decompressed_index". 
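+ *
+ * Each logical cluster owns one on-disk index entry: HEAD and PLAIN
+ * entries record the physical block address and the cluster offset of
+ * the extent, while NONHEAD entries only keep the backward distance
+ * (delta[0]) to the corresponding HEAD entry.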
+ */ +#define __vle_cluster_advise(x, bit, bits) \ + ((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1)) + +#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \ + Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT, Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) + +#define vle_cluster_type(di) \ + __vle_cluster_type((di)->di_advise) + +#ifdef CONFIG_EROFS_FS_HUAWEI_EXTENSION + +#define vle_huawei_compat_previous_clusters(clustersize, di) (\ + (le16_to_cpu((di)->di_clusterofs) / clustersize) | \ +(__vle_cluster_advise((di)->di_advise, 4, 4) << 4)) + +#endif + +static int +vle_decompressed_index_clusterofs(unsigned int *clusterofs, + unsigned int clustersize, + struct z_erofs_vle_decompressed_index *di) +{ + switch (vle_cluster_type(di)) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + *clusterofs = clustersize; + break; +#ifdef CONFIG_EROFS_FS_HUAWEI_EXTENSION + case Z_EROFS_VLE_CLUSTER_TYPE_HUAWEI_COMPAT: + if (vle_huawei_compat_previous_clusters(clustersize, di)) { + *clusterofs = clustersize; + break; + } + /* fallthrough */ +#endif + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + *clusterofs = le16_to_cpu(di->di_clusterofs); + break; + default: + DBG_BUGON(1); + return -EIO; + } + return 0; +} + +static inline erofs_blk_t +vle_extent_blkaddr(struct inode *inode, pgoff_t index) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + struct erofs_vnode *vi = EROFS_V(inode); + + unsigned ofs = Z_EROFS_VLE_EXTENT_ALIGN(vi->inode_isize + + vi->xattr_isize) + sizeof(struct erofs_extent_header) + + index * sizeof(struct z_erofs_vle_decompressed_index); + + return erofs_blknr(iloc(sbi, vi->nid) + ofs); +} + +static inline unsigned int +vle_extent_blkoff(struct inode *inode, pgoff_t index) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + struct erofs_vnode *vi = EROFS_V(inode); + + unsigned ofs = Z_EROFS_VLE_EXTENT_ALIGN(vi->inode_isize + + vi->xattr_isize) + sizeof(struct erofs_extent_header) + + index * sizeof(struct z_erofs_vle_decompressed_index); + + return erofs_blkoff(iloc(sbi, vi->nid) + ofs); +} + +struct vle_map_blocks_iter_ctx { + struct inode *inode; + struct super_block *sb; + unsigned int clusterbits; + + struct page **mpage_ret; + void **kaddr_ret; +}; + +static int +vle_get_logical_extent_head(const struct vle_map_blocks_iter_ctx *ctx, + unsigned int lcn, /* logical cluster number */ + unsigned long long *ofs, + erofs_blk_t *pblk, + unsigned int *flags) +{ + const unsigned int clustersize = 1 << ctx->clusterbits; + const erofs_blk_t mblk = vle_extent_blkaddr(ctx->inode, lcn); + struct page *mpage = *ctx->mpage_ret; /* extent metapage */ + + struct z_erofs_vle_decompressed_index *di; + unsigned int cluster_type, delta0; + + if (mpage->index != mblk) { + kunmap_atomic(*ctx->kaddr_ret); + unlock_page(mpage); + put_page(mpage); + + mpage = erofs_get_meta_page(ctx->sb, mblk, false); + if (IS_ERR(mpage)) { + *ctx->mpage_ret = NULL; + return PTR_ERR(mpage); + } + *ctx->mpage_ret = mpage; + *ctx->kaddr_ret = kmap_atomic(mpage); + } + + di = *ctx->kaddr_ret + vle_extent_blkoff(ctx->inode, lcn); + + cluster_type = vle_cluster_type(di); + switch (cluster_type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + delta0 = le16_to_cpu(di->di_u.delta[0]); + if (unlikely(!delta0 || delta0 > lcn)) { + errln("invalid NONHEAD dl0 %u at lcn %u of nid %llu", + delta0, lcn, EROFS_V(ctx->inode)->nid); + DBG_BUGON(1); + return -EIO; + } + return vle_get_logical_extent_head(ctx, + lcn - delta0, ofs, pblk, flags); + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + *flags ^= EROFS_MAP_ZIPPED; + /* fallthrough */ 
+#ifdef CONFIG_EROFS_FS_HUAWEI_EXTENSION + case Z_EROFS_VLE_CLUSTER_TYPE_HUAWEI_COMPAT: + lcn -= vle_huawei_compat_previous_clusters(clustersize, di); + /* fallthrough */ +#endif + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + /* clustersize should be a power of two */ + *ofs = ((u64)lcn << ctx->clusterbits) + + (le16_to_cpu(di->di_clusterofs) & (clustersize - 1)); + *pblk = le32_to_cpu(di->di_u.blkaddr); + break; + default: + errln("unknown cluster type %u at lcn %u of nid %llu", + cluster_type, lcn, EROFS_V(ctx->inode)->nid); + DBG_BUGON(1); + return -EIO; + } + return 0; +} + +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + struct page **mpage_ret, int flags) +{ + void *kaddr; + const struct vle_map_blocks_iter_ctx ctx = { + .inode = inode, + .sb = inode->i_sb, + .clusterbits = EROFS_I_SB(inode)->clusterbits, + .mpage_ret = mpage_ret, + .kaddr_ret = &kaddr + }; + const unsigned int clustersize = 1 << ctx.clusterbits; + /* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */ + const bool initial = !map->m_llen; + + /* logicial extent (start, end) offset */ + unsigned long long ofs, end; + unsigned int lcn; + u32 ofs_rem; + + erofs_blk_t mblk, pblk; + struct page *mpage = *mpage_ret; + struct z_erofs_vle_decompressed_index *di; + unsigned int cluster_type, logical_cluster_ofs; + int err = 0; + + trace_z_erofs_map_blocks_iter_enter(inode, map, flags); + + /* when trying to read beyond EOF, leave it unmapped */ + if (unlikely(map->m_la >= inode->i_size)) { + DBG_BUGON(!initial); + map->m_llen = map->m_la + 1 - inode->i_size; + map->m_la = inode->i_size; + map->m_flags = 0; + goto out; + } + + debugln("%s, m_la %llu m_llen %llu --- start", __func__, + map->m_la, map->m_llen); + + ofs = map->m_la + map->m_llen; + + /* clustersize should be power of two */ + lcn = ofs >> ctx.clusterbits; + ofs_rem = ofs & (clustersize - 1); + + mblk = vle_extent_blkaddr(inode, lcn); + + if (mpage == NULL || mpage->index != mblk) { + if (mpage != NULL) + put_page(mpage); + + mpage = erofs_get_meta_page(ctx.sb, mblk, false); + if (IS_ERR(mpage)) { + err = PTR_ERR(mpage); + goto out; + } + *mpage_ret = mpage; + } else { + lock_page(mpage); + DBG_BUGON(!PageUptodate(mpage)); + } + + kaddr = kmap_atomic(mpage); + di = kaddr + vle_extent_blkoff(inode, lcn); + + debugln("%s, lcn %u mblk %u e_blkoff %u", __func__, lcn, + mblk, vle_extent_blkoff(inode, lcn)); + + err = vle_decompressed_index_clusterofs(&logical_cluster_ofs, + clustersize, di); + if (unlikely(err)) + goto unmap_out; + + if (!initial) { + /* [walking mode] 'map' has been already initialized */ + map->m_llen += logical_cluster_ofs; + goto unmap_out; + } + + /* by default, compressed */ + map->m_flags |= EROFS_MAP_ZIPPED; + + end = ((u64)lcn + 1) * clustersize; + + cluster_type = vle_cluster_type(di); + + switch (cluster_type) { + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + if (ofs_rem >= logical_cluster_ofs) + map->m_flags ^= EROFS_MAP_ZIPPED; + /* fallthrough */ +#ifdef CONFIG_EROFS_FS_HUAWEI_EXTENSION + case Z_EROFS_VLE_CLUSTER_TYPE_HUAWEI_COMPAT: + if (vle_huawei_compat_previous_clusters(clustersize, di)) { + end = (lcn-- + 1ULL) * clustersize; + goto nonhead; + } + /* fallthrough */ +#endif + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + if (ofs_rem == logical_cluster_ofs) { + pblk = le32_to_cpu(di->di_u.blkaddr); + goto exact_hitted; + } + + if (ofs_rem > logical_cluster_ofs) { + ofs = (u64)lcn * clustersize | logical_cluster_ofs; + pblk = le32_to_cpu(di->di_u.blkaddr); + break; + } + + /* logical cluster number should be >= 1 
*/ + if (unlikely(!lcn)) { + errln("invalid logical cluster 0 at nid %llu", + EROFS_V(inode)->nid); + err = -EIO; + goto unmap_out; + } + end = ((u64)lcn-- * clustersize) | logical_cluster_ofs; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: +nonhead: /* get the correspoinding first chunk */ + err = vle_get_logical_extent_head(&ctx, lcn, &ofs, + &pblk, &map->m_flags); + mpage = *mpage_ret; + + if (unlikely(err)) { + if (mpage != NULL) + goto unmap_out; + goto out; + } + break; + default: + errln("unknown cluster type %u at offset %llu of nid %llu", + cluster_type, ofs, EROFS_V(inode)->nid); + err = -EIO; + goto unmap_out; + } + + map->m_la = ofs; +exact_hitted: + map->m_llen = end - ofs; + map->m_plen = clustersize; + map->m_pa = blknr_to_addr(pblk); + map->m_flags |= EROFS_MAP_MAPPED; +unmap_out: + kunmap_atomic(kaddr); + unlock_page(mpage); +out: + debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", + __func__, map->m_la, map->m_pa, + map->m_llen, map->m_plen, map->m_flags); + + trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); + + /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ + DBG_BUGON(err < 0 && err != -ENOMEM); + return err; +} + diff --git a/fs/erofs/unzip_vle.h b/fs/erofs/unzip_vle.h new file mode 100644 index 000000000000..a280161c6661 --- /dev/null +++ b/fs/erofs/unzip_vle.h @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * linux/drivers/staging/erofs/unzip_vle.h + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#ifndef __EROFS_FS_UNZIP_VLE_H +#define __EROFS_FS_UNZIP_VLE_H + +#include "internal.h" +#include "unzip_pagevec.h" + +/* + * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) - + * used for temporary allocated pages (via erofs_allocpage), + * in order to seperate those from NULL mapping (eg. truncated pages) + */ +#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D) + +#define z_erofs_is_stagingpage(page) \ + ((page)->mapping == Z_EROFS_MAPPING_STAGING) + +static inline bool z_erofs_gather_if_stagingpage(struct list_head *page_pool, + struct page *page) +{ + if (z_erofs_is_stagingpage(page)) { + list_add(&page->lru, page_pool); + return true; + } + return false; +} + +/* + * - 0x6A110C8D ('pallocated', Z_EROFS_MAPPING_PREALLOCATED) - + * preallocated cached pages, will be added into managed cache later + */ +#define Z_EROFS_MAPPING_PREALLOCATED ((void *)0x6A110C8D) + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else. 
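+ * L: Protected by the per-work lock (work->lock).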
+ * + */ + +#define Z_EROFS_VLE_INLINE_PAGEVECS 3 + +struct z_erofs_vle_work { + struct mutex lock; + + /* I: decompression offset in page */ + unsigned short pageofs; + unsigned short nr_pages; + + /* L: queued pages in pagevec[] */ + unsigned vcnt; + + union { + /* L: pagevec */ + erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS]; + struct rcu_head rcu; + }; +}; + +#define Z_EROFS_VLE_WORKGRP_FMT_PLAIN 0 +#define Z_EROFS_VLE_WORKGRP_FMT_LZ4 1 +#define Z_EROFS_VLE_WORKGRP_FMT_MASK 1 + +typedef struct z_erofs_vle_workgroup *z_erofs_vle_owned_workgrp_t; + +/* compressed page tagged pointer (bit 0 - with an extra reference) */ +typedef tagptr1_t z_erofs_ctptr_t; + +struct z_erofs_vle_workgroup { + struct erofs_workgroup obj; + struct z_erofs_vle_work work; + + /* next owned workgroup */ + z_erofs_vle_owned_workgrp_t next; + + /* compressed pages (including multi-usage pages) */ + struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES]; + unsigned int llen, flags; +}; + +/* let's avoid the valid 32-bit kernel addresses */ + +/* the chained workgroup has't submitted io (still open) */ +#define Z_EROFS_VLE_WORKGRP_TAIL ((void *)0x5F0ECAFE) +/* the chained workgroup has already submitted io */ +#define Z_EROFS_VLE_WORKGRP_TAIL_CLOSED ((void *)0x5F0EDEAD) + +#define Z_EROFS_VLE_WORKGRP_NIL (NULL) + +#define z_erofs_vle_workgrp_fmt(grp) \ + ((grp)->flags & Z_EROFS_VLE_WORKGRP_FMT_MASK) + +static inline void z_erofs_vle_set_workgrp_fmt( + struct z_erofs_vle_workgroup *grp, + unsigned int fmt) +{ + grp->flags = fmt | (grp->flags & ~Z_EROFS_VLE_WORKGRP_FMT_MASK); +} + + +/* definitions if multiref is disabled */ +#define z_erofs_vle_grab_primary_work(grp) (&(grp)->work) +#define z_erofs_vle_grab_work(grp, pageofs) (&(grp)->work) +#define z_erofs_vle_work_workgroup(wrk, primary) \ + ((primary) ? container_of(wrk, \ + struct z_erofs_vle_workgroup, work) : \ + ({ BUG(); (void *)NULL; })) + + +#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_vle_workgroup) + +struct z_erofs_vle_unzip_io { + atomic_t pending_bios; + z_erofs_vle_owned_workgrp_t head; + + union { + wait_queue_head_t wait; + struct work_struct work; + } u; +}; + +struct z_erofs_vle_unzip_io_sb { + struct z_erofs_vle_unzip_io io; + struct super_block *sb; +}; + +#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 +#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) +#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) + +/* + * waiters (aka. 
ongoing_packs): # to unlock the page + * sub-index: 0 - for partial page, >= 1 full page sub-index + */ +typedef atomic_t z_erofs_onlinepage_t; + +/* type punning */ +union z_erofs_onlinepage_converter { + z_erofs_onlinepage_t *o; + unsigned long *v; +}; + +static inline unsigned z_erofs_onlinepage_index2(struct page *page) +{ + union z_erofs_onlinepage_converter u; + + PAGE_BUGON(!PagePrivate(page), page); + u.v = &page_private(page); + + return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; +} + +static inline unsigned z_erofs_onlinepage_index(struct page *page) +{ + union z_erofs_onlinepage_converter u; + + PAGE_BUGON(!PagePrivate(page), page); + u.v = &page_private(page); + + return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; +} + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + z_erofs_onlinepage_t o; + unsigned long v; + /* keep from being unlocked in advance */ + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_fixup(struct page *page, + uintptr_t index, bool down) +{ + unsigned long *p, o, v, id; +repeat: + p = &page_private(page); + o = READ_ONCE(*p); + + id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; + if (id) { + if (!index) + return; + + BUG_ON(id != index); + } + + v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | + ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down); + if (cmpxchg(p, o, v) != o) + goto repeat; +} + +static inline void z_erofs_onlinepage_endio(struct page *page) +{ + union z_erofs_onlinepage_converter u; + unsigned v; + + PAGE_BUGON(!PagePrivate(page), page); + u.v = &page_private(page); + + v = atomic_dec_return(u.o); + if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + ClearPagePrivate(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + + debugln("%s, page %p value %x", __func__, page, atomic_read(u.o)); +} + +#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES \ + min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) +#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES 2048 + +/* unzip_vle_lz4.c */ +extern int z_erofs_vle_plain_copy(struct page **compressed_pages, + unsigned clusterpages, struct page **pages, + unsigned nr_pages, unsigned short pageofs); + +extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages, + unsigned clusterpages, struct page **pages, + unsigned outlen, unsigned short pageofs, bool accel); + +extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages, + unsigned clusterpages, void *vaddr, unsigned llen, + unsigned short pageofs, bool overlapped, bool accel); + +#endif + diff --git a/fs/erofs/unzip_vle_lz4.c b/fs/erofs/unzip_vle_lz4.c new file mode 100644 index 000000000000..f3af8df8cf23 --- /dev/null +++ b/fs/erofs/unzip_vle_lz4.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/unzip_vle_lz4.c + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ +#include "unzip_vle.h" + +#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS +#define EROFS_PERCPU_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES +#else +#define EROFS_PERCPU_NR_PAGES Z_EROFS_VLE_INLINE_PAGEVECS +#endif + +static struct { + char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES]; +} erofs_pcpubuf[NR_CPUS]; + +int z_erofs_vle_plain_copy(struct page **compressed_pages, + unsigned clusterpages, + struct page **pages, + unsigned nr_pages, + unsigned short pageofs) +{ + unsigned i, j; + void *src = NULL; + const unsigned righthalf = PAGE_SIZE - pageofs; + char *percpu_data; + bool mirrored[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 }; + + /* temp shortcut for pageofs = 0 */ + if (!pageofs && nr_pages == 1) { + if (compressed_pages[0] == pages[0]) + return 0; + if (pages[0]) { + copy_highpage(pages[0], compressed_pages[0]); + return 0; + } + } + + preempt_disable(); + percpu_data = erofs_pcpubuf[smp_processor_id()].data; + + j = 0; + for (i = 0; i < nr_pages; j = i++) { + struct page *page = pages[i]; + void *dst; + + if (page == NULL) { + if (src != NULL) { + if (!mirrored[j]) + kunmap_atomic(src); + src = NULL; + } + continue; + } + + dst = kmap_atomic(page); + + for (; j < clusterpages; ++j) { + if (compressed_pages[j] != page) + continue; + + BUG_ON(mirrored[j]); + memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE); + mirrored[j] = true; + break; + } + + if (i) { + if (src == NULL) + src = mirrored[i-1] ? + percpu_data + (i-1) * PAGE_SIZE : + kmap_atomic(compressed_pages[i-1]); + + memcpy(dst, src + righthalf, pageofs); + + if (!mirrored[i-1]) + kunmap_atomic(src); + + if (unlikely(i >= clusterpages)) { + kunmap_atomic(dst); + break; + } + } + + if (!righthalf) + src = NULL; + else { + src = mirrored[i] ? percpu_data + i * PAGE_SIZE : + kmap_atomic(compressed_pages[i]); + + memcpy(dst + pageofs, src, righthalf); + } + + kunmap_atomic(dst); + } + + if (src != NULL && !mirrored[j]) + kunmap_atomic(src); + + preempt_enable(); + return 0; +} + +extern int z_erofs_unzip_lz4(void *in, void *out, + size_t inlen, size_t outlen, bool accel); + +int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages, + unsigned clusterpages, + struct page **pages, + unsigned outlen, + unsigned short pageofs, + bool accel) +{ + void *vin, *vout; + unsigned nr_pages, i, j; + int ret; + + if (outlen + pageofs > EROFS_PERCPU_NR_PAGES * PAGE_SIZE) + return -ENOTSUPP; + + nr_pages = DIV_ROUND_UP(outlen + pageofs, PAGE_SIZE); + + if (clusterpages == 1) + vin = kmap_atomic(compressed_pages[0]); + else + vin = erofs_vmap(compressed_pages, clusterpages); + + preempt_disable(); + vout = erofs_pcpubuf[smp_processor_id()].data; + + ret = z_erofs_unzip_lz4(vin, vout + pageofs, + clusterpages * PAGE_SIZE, outlen, accel); + + if (ret < 0) + goto fail; + + ret = 0; + + for (i = 0; i < nr_pages; ++i) { + j = min((unsigned)PAGE_SIZE - pageofs, outlen); + + if (pages[i]) { + if (clusterpages == 1 && + pages[i] == compressed_pages[0]) { + memcpy(vin + pageofs, vout + pageofs, j); + } else { + void *dst = kmap_atomic(pages[i]); + + memcpy(dst + pageofs, vout + pageofs, j); + kunmap_atomic(dst); + } + } + vout += PAGE_SIZE; + outlen -= j; + pageofs = 0; + } + +fail: + preempt_enable(); + + if (clusterpages == 1) + kunmap_atomic(vin); + else + erofs_vunmap(vin, clusterpages); + + return ret; +} + +int z_erofs_vle_unzip_vmap(struct page **compressed_pages, + unsigned clusterpages, + void *vout, + unsigned llen, + unsigned short pageofs, + bool overlapped, + bool accel) +{ + void *vin; + unsigned i; + int ret; + + if (overlapped) { + 
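/* the compressed pages overlap the output: stage them in the per-CPU buffer first so in-place decompression cannot overwrite its own input */ +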
preempt_disable(); + vin = erofs_pcpubuf[smp_processor_id()].data; + + for (i = 0; i < clusterpages; ++i) { + void *t = kmap_atomic(compressed_pages[i]); + + memcpy(vin + PAGE_SIZE *i, t, PAGE_SIZE); + kunmap_atomic(t); + } + } else if (clusterpages == 1) + vin = kmap_atomic(compressed_pages[0]); + else { + vin = erofs_vmap(compressed_pages, clusterpages); + } + + if (!vin) + return -ENOMEM; + + ret = z_erofs_unzip_lz4(vin, vout + pageofs, + clusterpages * PAGE_SIZE, llen, accel); + if (ret > 0) + ret = 0; + + if (!overlapped) { + if (clusterpages == 1) + kunmap_atomic(vin); + else { + erofs_vunmap(vin, clusterpages); + } + } else + preempt_enable(); + + return ret; +} + diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c new file mode 100644 index 000000000000..a691b96c0d80 --- /dev/null +++ b/fs/erofs/utils.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/utils.c + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#include "internal.h" +#include + +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) +{ + struct page *page; + + if (!list_empty(pool)) { + page = lru_to_page(pool); + list_del(&page->lru); + } else { + page = alloc_pages(gfp | __GFP_NOFAIL, 0); + + BUG_ON(page == NULL); + BUG_ON(page->mapping != NULL); + } + return page; +} + +/* global shrink count (for all mounted EROFS instances) */ +atomic_long_t erofs_global_shrink_cnt; + +#ifdef CONFIG_EROFS_FS_ZIP + +/* radix_tree and the future XArray both don't use tagptr_t yet */ +struct erofs_workgroup *erofs_find_workgroup( + struct super_block *sb, pgoff_t index, bool *tag) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_workgroup *grp; + int oldcount; + +repeat: + rcu_read_lock(); + grp = radix_tree_lookup(&sbi->workstn.tree, index); + if (grp != NULL) { + *tag = radix_tree_exceptional_entry(grp); + grp = (void *)((unsigned long)grp & + ~RADIX_TREE_EXCEPTIONAL_ENTRY); + + if (erofs_workgroup_get(grp, &oldcount)) { + /* prefer to relax rcu read side */ + rcu_read_unlock(); + goto repeat; + } + + /* decrease refcount added by erofs_workgroup_put */ + if (unlikely(oldcount == 1)) + atomic_long_dec(&erofs_global_shrink_cnt); + BUG_ON(index != grp->index); + } + rcu_read_unlock(); + return grp; +} + +int erofs_register_workgroup(struct super_block *sb, + struct erofs_workgroup *grp, + bool tag) +{ + struct erofs_sb_info *sbi; + int err; + + /* grp->refcount should not < 1 */ + DBG_BUGON(atomic_read(&grp->refcount) < 1); + + err = radix_tree_preload(GFP_NOFS); + if (err) + return err; + + sbi = EROFS_SB(sb); + erofs_workstn_lock(sbi); + + if (tag) + grp = (void *)((unsigned long)grp | + 1UL << RADIX_TREE_EXCEPTIONAL_SHIFT); + + /* + * If managed cache is enabled, the reclaim path assumes + * that the last reference count is used for its workstation. + * Therefore we should bump up reference count before + * making this workgroup visible to other users. 
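+ * (this is the reference the reclaim path expects to be the last one left once the workgroup becomes idle)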
+ */ +#ifdef EROFS_FS_HAS_MANAGED_CACHE + /* refcount should be at least 2 to get on well with reclaim path */ + __erofs_workgroup_get(grp); +#endif + + err = radix_tree_insert(&sbi->workstn.tree, + grp->index, grp); + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (unlikely(err)) + /* it is safe to decrease for refcount >= 2 */ + atomic_dec(&grp->refcount); +#else + if (!err) + __erofs_workgroup_get(grp); +#endif + + erofs_workstn_unlock(sbi); + radix_tree_preload_end(); + return err; +} + +extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); + +static void __erofs_workgroup_free(struct erofs_workgroup *grp) +{ + atomic_long_dec(&erofs_global_shrink_cnt); + erofs_workgroup_free_rcu(grp); +} + +int erofs_workgroup_put(struct erofs_workgroup *grp) +{ + int count = atomic_dec_return(&grp->refcount); + + if (count == 1) + atomic_long_inc(&erofs_global_shrink_cnt); + else if (!count) + __erofs_workgroup_free(grp); + return count; +} + +static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp) +{ + erofs_workgroup_unfreeze(grp, 0); + __erofs_workgroup_free(grp); +} + +unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, + unsigned long nr_shrink, + bool cleanup) +{ + pgoff_t first_index = 0; + void *batch[PAGEVEC_SIZE]; + unsigned freed = 0; + + int i, found; +repeat: + erofs_workstn_lock(sbi); + + found = radix_tree_gang_lookup(&sbi->workstn.tree, + batch, first_index, PAGEVEC_SIZE); + + for (i = 0; i < found; ++i) { +#ifndef EROFS_FS_HAS_MANAGED_CACHE + int cnt; +#endif + struct erofs_workgroup *grp = (void *) + ((unsigned long)batch[i] & + ~RADIX_TREE_EXCEPTIONAL_ENTRY); + + first_index = grp->index + 1; + +#ifndef EROFS_FS_HAS_MANAGED_CACHE + cnt = atomic_read(&grp->refcount); + DBG_BUGON(cnt <= 0); + + if (cleanup) + DBG_BUGON(cnt != 1); + else if (cnt > 1) +#else + if (!erofs_workgroup_try_to_freeze(grp, 1)) +#endif + continue; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (erofs_try_to_free_all_cached_pages(sbi, grp)) { +skip: + erofs_workgroup_unfreeze(grp, 1); + continue; + } + + if (radix_tree_delete(&sbi->workstn.tree, + grp->index) != grp) + goto skip; + + /* + * if managed cache is enable, the last refcount + * should indicate the related workstation. 
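+ * (i.e. a refcount of 1 means only the workstation still references the workgroup, so it can be reclaimed safely)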
+ */ + erofs_workgroup_unfreeze_final(grp); +#else + if (radix_tree_delete(&sbi->workstn.tree, + grp->index) != grp) + continue; + + /* (rarely) grabbed again when freeing */ + erofs_workgroup_put(grp); +#endif + ++freed; + if (unlikely(!--nr_shrink)) + break; + } + erofs_workstn_unlock(sbi); + + if (i && nr_shrink) + goto repeat; + return freed; +} + +#endif + +/* protected by 'erofs_sb_list_lock' */ +static unsigned int shrinker_run_no; + +/* protects the mounted 'erofs_sb_list' */ +static DEFINE_SPINLOCK(erofs_sb_list_lock); +static LIST_HEAD(erofs_sb_list); + +void erofs_register_super(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_init(&sbi->umount_mutex); + + spin_lock(&erofs_sb_list_lock); + list_add(&sbi->list, &erofs_sb_list); + spin_unlock(&erofs_sb_list_lock); +} + +void erofs_unregister_super(struct super_block *sb) +{ + spin_lock(&erofs_sb_list_lock); + list_del(&EROFS_SB(sb)->list); + spin_unlock(&erofs_sb_list_lock); +} + +unsigned long erofs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return atomic_long_read(&erofs_global_shrink_cnt); +} + +unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct erofs_sb_info *sbi; + struct list_head *p; + + unsigned long nr = sc->nr_to_scan; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&erofs_sb_list_lock); + do + run_no = ++shrinker_run_no; + while (run_no == 0); + + /* Iterate over all mounted superblocks and try to shrink them */ + p = erofs_sb_list.next; + while (p != &erofs_sb_list) { + sbi = list_entry(p, struct erofs_sb_info, list); + + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (sbi->shrinker_run_no == run_no) + break; + + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + + spin_unlock(&erofs_sb_list_lock); + sbi->shrinker_run_no = run_no; + +#ifdef CONFIG_EROFS_FS_ZIP + freed += erofs_shrink_workstation(sbi, nr, false); +#endif + + spin_lock(&erofs_sb_list_lock); + /* Get the next list element before we move this one */ + p = p->next; + + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_move_tail(&sbi->list, &erofs_sb_list); + mutex_unlock(&sbi->umount_mutex); + + if (freed >= nr) + break; + } + spin_unlock(&erofs_sb_list_lock); + return freed; +} + diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c new file mode 100644 index 000000000000..9d92de5c364e --- /dev/null +++ b/fs/erofs/xattr.c @@ -0,0 +1,728 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/xattr.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ +#include +#include "xattr.h" + +struct xattr_iter { + struct super_block *sb; + struct page *page; + void *kaddr; + + erofs_blk_t blkaddr; + unsigned ofs; +}; + +static inline void xattr_iter_end(struct xattr_iter *it, bool atomic) +{ + /* the only user of kunmap() is 'init_inode_xattrs' */ + if (unlikely(!atomic)) + kunmap(it->page); + else + kunmap_atomic(it->kaddr); + + unlock_page(it->page); + put_page(it->page); +} + +static inline void xattr_iter_end_final(struct xattr_iter *it) +{ + if (it->page == NULL) + return; + + xattr_iter_end(it, true); +} + +static int init_inode_xattrs(struct inode *inode) +{ + struct xattr_iter it; + unsigned i; + struct erofs_xattr_ibody_header *ih; + struct super_block *sb; + struct erofs_sb_info *sbi; + struct erofs_vnode *vi; + bool atomic_map; + + if (likely(inode_has_inited_xattr(inode))) + return 0; + + vi = EROFS_V(inode); + BUG_ON(!vi->xattr_isize); + + sb = inode->i_sb; + sbi = EROFS_SB(sb); + it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); + it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); + + it.page = erofs_get_inline_page(inode, it.blkaddr); + if (IS_ERR(it.page)) + return PTR_ERR(it.page); + + /* read in shared xattr array (non-atomic, see kmalloc below) */ + it.kaddr = kmap(it.page); + atomic_map = false; + + ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + + vi->xattr_shared_count = ih->h_shared_count; + vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, + sizeof(uint), GFP_KERNEL); + if (vi->xattr_shared_xattrs == NULL) { + xattr_iter_end(&it, atomic_map); + return -ENOMEM; + } + + /* let's skip ibody header */ + it.ofs += sizeof(struct erofs_xattr_ibody_header); + + for (i = 0; i < vi->xattr_shared_count; ++i) { + if (unlikely(it.ofs >= EROFS_BLKSIZ)) { + /* cannot be unaligned */ + BUG_ON(it.ofs != EROFS_BLKSIZ); + xattr_iter_end(&it, atomic_map); + + it.page = erofs_get_meta_page(sb, + ++it.blkaddr, S_ISDIR(inode->i_mode)); + if (IS_ERR(it.page)) + return PTR_ERR(it.page); + + it.kaddr = kmap_atomic(it.page); + atomic_map = true; + it.ofs = 0; + } + vi->xattr_shared_xattrs[i] = + le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); + it.ofs += sizeof(__le32); + } + xattr_iter_end(&it, atomic_map); + + inode_set_inited_xattr(inode); + return 0; +} + +struct xattr_iter_handlers { + int (*entry)(struct xattr_iter *, struct erofs_xattr_entry *); + int (*name)(struct xattr_iter *, unsigned, char *, unsigned); + int (*alloc_buffer)(struct xattr_iter *, unsigned); + void (*value)(struct xattr_iter *, unsigned, char *, unsigned); +}; + +static inline int xattr_iter_fixup(struct xattr_iter *it) +{ + if (it->ofs < EROFS_BLKSIZ) + return 0; + + xattr_iter_end(it, true); + + it->blkaddr += erofs_blknr(it->ofs); + + it->page = erofs_get_meta_page(it->sb, it->blkaddr, false); + if (IS_ERR(it->page)) { + int err = PTR_ERR(it->page); + + it->page = NULL; + return err; + } + + it->kaddr = kmap_atomic(it->page); + it->ofs = erofs_blkoff(it->ofs); + return 0; +} + +static int inline_xattr_iter_begin(struct xattr_iter *it, + struct inode *inode) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); + unsigned xattr_header_sz, inline_xattr_ofs; + + xattr_header_sz = inlinexattr_header_size(inode); + if (unlikely(xattr_header_sz >= vi->xattr_isize)) { + BUG_ON(xattr_header_sz > vi->xattr_isize); + return -ENOATTR; + } + + inline_xattr_ofs = vi->inode_isize + xattr_header_sz; + + it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); + 
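/* and the byte offset of the first inline xattr entry within that block */ +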
it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); + + it->page = erofs_get_inline_page(inode, it->blkaddr); + if (IS_ERR(it->page)) + return PTR_ERR(it->page); + + it->kaddr = kmap_atomic(it->page); + return vi->xattr_isize - xattr_header_sz; +} + +static int xattr_foreach(struct xattr_iter *it, + const struct xattr_iter_handlers *op, unsigned int *tlimit) +{ + struct erofs_xattr_entry entry; + unsigned value_sz, processed, slice; + int err; + + /* 0. fixup blkaddr, ofs, ipage */ + err = xattr_iter_fixup(it); + if (err) + return err; + + /* + * 1. read xattr entry to the memory, + * since we do EROFS_XATTR_ALIGN + * therefore entry should be in the page + */ + entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); + if (tlimit != NULL) { + unsigned entry_sz = EROFS_XATTR_ENTRY_SIZE(&entry); + + BUG_ON(*tlimit < entry_sz); + *tlimit -= entry_sz; + } + + it->ofs += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); + + /* handle entry */ + err = op->entry(it, &entry); + if (err) { + it->ofs += entry.e_name_len + value_sz; + goto out; + } + + /* 2. handle xattr name (ofs will finally be at the end of name) */ + processed = 0; + + while (processed < entry.e_name_len) { + if (it->ofs >= EROFS_BLKSIZ) { + BUG_ON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned, PAGE_SIZE - it->ofs, + entry.e_name_len - processed); + + /* handle name */ + err = op->name(it, processed, it->kaddr + it->ofs, slice); + if (err) { + it->ofs += entry.e_name_len - processed + value_sz; + goto out; + } + + it->ofs += slice; + processed += slice; + } + + /* 3. handle xattr value */ + processed = 0; + + if (op->alloc_buffer != NULL) { + err = op->alloc_buffer(it, value_sz); + if (err) { + it->ofs += value_sz; + goto out; + } + } + + while (processed < value_sz) { + if (it->ofs >= EROFS_BLKSIZ) { + BUG_ON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned, PAGE_SIZE - it->ofs, + value_sz - processed); + op->value(it, processed, it->kaddr + it->ofs, slice); + it->ofs += slice; + processed += slice; + } + +out: + /* we assume that ofs is aligned with 4 bytes */ + it->ofs = EROFS_XATTR_ALIGN(it->ofs); + return err; +} + +struct getxattr_iter { + struct xattr_iter it; + + char *buffer; + int buffer_size, index; + struct qstr name; +}; + +static int xattr_entrymatch(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return (it->index != entry->e_name_index || + it->name.len != entry->e_name_len) ? -ENOATTR : 0; +} + +static int xattr_namematch(struct xattr_iter *_it, + unsigned processed, char *buf, unsigned len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0; +} + +static int xattr_checkbuffer(struct xattr_iter *_it, + unsigned value_sz) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + int err = it->buffer_size < value_sz ? -ERANGE : 0; + + it->buffer_size = value_sz; + return it->buffer == NULL ? 
1 : err; +} + +static void xattr_copyvalue(struct xattr_iter *_it, + unsigned processed, char *buf, unsigned len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + memcpy(it->buffer + processed, buf, len); +} + +static const struct xattr_iter_handlers find_xattr_handlers = { + .entry = xattr_entrymatch, + .name = xattr_namematch, + .alloc_buffer = xattr_checkbuffer, + .value = xattr_copyvalue +}; + +static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) +{ + int ret; + unsigned remaining; + + ret = inline_xattr_iter_begin(&it->it, inode); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); + if (ret >= 0) + break; + + if (ret != -ENOATTR) /* -ENOMEM, -EIO, etc. */ + break; + } + xattr_iter_end_final(&it->it); + + return ret < 0 ? ret : it->buffer_size; +} + +static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + struct super_block *const sb = inode->i_sb; + struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned i; + int ret = -ENOATTR; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); + + if (!i || blkaddr != it->it.blkaddr) { + if (i) + xattr_iter_end(&it->it, true); + + it->it.page = erofs_get_meta_page(sb, blkaddr, false); + if (IS_ERR(it->it.page)) + return PTR_ERR(it->it.page); + + it->it.kaddr = kmap_atomic(it->it.page); + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); + if (ret >= 0) + break; + + if (ret != -ENOATTR) /* -ENOMEM, -EIO, etc. */ + break; + } + if (vi->xattr_shared_count) + xattr_iter_end_final(&it->it); + + return ret < 0 ? 
ret : it->buffer_size; +} + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)) +static int erofs_xattr_get_prefix(struct erofs_sb_info *sbi, + int type, const char **prefix) +{ + switch (type) { + case EROFS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + *prefix = XATTR_USER_PREFIX; + return XATTR_USER_PREFIX_LEN; + + case EROFS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + *prefix = XATTR_TRUSTED_PREFIX; + return XATTR_TRUSTED_PREFIX_LEN; + + case EROFS_XATTR_INDEX_SECURITY: + *prefix = XATTR_SECURITY_PREFIX; + return XATTR_SECURITY_PREFIX_LEN; + } + return -EINVAL; +} + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)) +static size_t erofs_xattr_generic_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +#else +static size_t erofs_xattr_generic_list(const struct xattr_handler *handler, + struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len) +#endif +{ + struct erofs_sb_info *sbi = EROFS_SB(dentry->d_sb); +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) + int type = handler->flags; +#endif + int total_len, prefix_len; + const char *prefix; + + prefix_len = erofs_xattr_get_prefix(sbi, type, &prefix); + if (prefix_len < 0) + return prefix_len; + + total_len = prefix_len + name_len + 1; + if (list && total_len <= list_size) { + memcpy(list, prefix, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +#else +static bool erofs_xattr_user_list(struct dentry *dentry) +{ + return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER); +} + +static bool erofs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} +#endif + +int erofs_getxattr(struct inode *inode, int index, + const char *name, + void *buffer, size_t buffer_size) +{ + int ret; + struct getxattr_iter it; + + if (unlikely(name == NULL)) + return -EINVAL; + + ret = init_inode_xattrs(inode); + if (ret) + return ret; + + it.index = index; + + it.name.len = strlen(name); + if (it.name.len > EROFS_NAME_LEN) + return -ERANGE; + it.name.name = name; + + it.buffer = buffer; + it.buffer_size = buffer_size; + + it.it.sb = inode->i_sb; + ret = inline_getxattr(inode, &it); + if (ret == -ENOATTR) + ret = shared_getxattr(inode, &it); + return ret; +} + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)) +static int erofs_xattr_generic_get(struct dentry *dentry, + const char *name, + void *buffer, size_t size, int type) +{ + struct inode *inode = d_inode(dentry); +#else +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 7, 0)) +static int erofs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, void *buffer, + size_t size) +{ + struct inode *inode = d_inode(dentry); +#else +static int erofs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ +#endif +#endif + struct erofs_vnode *const vi = EROFS_V(inode); + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)) + switch (type) { +#else + switch (handler->flags) { +#endif + + case EROFS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case EROFS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + break; + case EROFS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + +#if (LINUX_VERSION_CODE < 
KERNEL_VERSION(4, 4, 0)) + if (name[0] == '\0') + return -EINVAL; +#endif + + if (!vi->xattr_isize) + return -ENOATTR; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)) + return erofs_getxattr(inode, type, name, buffer, size); +#else + return erofs_getxattr(inode, handler->flags, name, buffer, size); +#endif +} + +const struct xattr_handler erofs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = EROFS_XATTR_INDEX_USER, +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)) + .list = erofs_xattr_generic_list, +#else + .list = erofs_xattr_user_list, +#endif + + .get = erofs_xattr_generic_get, +}; + +const struct xattr_handler erofs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = EROFS_XATTR_INDEX_TRUSTED, +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)) + .list = erofs_xattr_generic_list, +#else + .list = erofs_xattr_trusted_list, +#endif + .get = erofs_xattr_generic_get, +}; + +#ifdef CONFIG_EROFS_FS_SECURITY +const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = EROFS_XATTR_INDEX_SECURITY, + .get = erofs_xattr_generic_get, +}; +#endif + +const struct xattr_handler *erofs_xattr_handlers[] = { + &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + &erofs_xattr_security_handler, +#endif + NULL, +}; + +struct listxattr_iter { + struct xattr_iter it; + + struct dentry *dentry; + char *buffer; + int buffer_size, buffer_ofs; +}; + +static int xattr_entrylist(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + unsigned prefix_len; + const char *prefix; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)) + struct erofs_sb_info *sbi = EROFS_SB(it->dentry->d_sb); + int ret = erofs_xattr_get_prefix(sbi, entry->e_name_index, &prefix); + + if (ret < 0) + return 1; + + prefix_len = ret; +#else + const struct xattr_handler *h = + erofs_xattr_handler(entry->e_name_index); + + if (h == NULL || (h->list != NULL && !h->list(it->dentry))) + return 1; + + /* Note that at least one of 'prefix' and 'name' should be non-NULL */ + prefix = h->prefix != NULL ? 
h->prefix : h->name; + prefix_len = strlen(prefix); +#endif + + if (it->buffer == NULL) { + it->buffer_ofs += prefix_len + entry->e_name_len + 1; + return 1; + } + + if (it->buffer_ofs + prefix_len + + entry->e_name_len + 1 > it->buffer_size) + return -ERANGE; + + memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); + it->buffer_ofs += prefix_len; + return 0; +} + +static int xattr_namelist(struct xattr_iter *_it, + unsigned processed, char *buf, unsigned len) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + memcpy(it->buffer + it->buffer_ofs, buf, len); + it->buffer_ofs += len; + return 0; +} + +static int xattr_skipvalue(struct xattr_iter *_it, + unsigned value_sz) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + it->buffer[it->buffer_ofs++] = '\0'; + return 1; +} + +static const struct xattr_iter_handlers list_xattr_handlers = { + .entry = xattr_entrylist, + .name = xattr_namelist, + .alloc_buffer = xattr_skipvalue, + .value = NULL +}; + +static int inline_listxattr(struct listxattr_iter *it) +{ + int ret; + unsigned remaining; + + ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); + if (ret < 0) + break; + } + xattr_iter_end_final(&it->it); + return ret < 0 ? ret : it->buffer_ofs; +} + +static int shared_listxattr(struct listxattr_iter *it) +{ + struct inode *const inode = d_inode(it->dentry); + struct erofs_vnode *const vi = EROFS_V(inode); + struct super_block *const sb = inode->i_sb; + struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned i; + int ret = 0; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); + if (!i || blkaddr != it->it.blkaddr) { + if (i) + xattr_iter_end(&it->it, true); + + it->it.page = erofs_get_meta_page(sb, blkaddr, false); + if (IS_ERR(it->it.page)) + return PTR_ERR(it->it.page); + + it->it.kaddr = kmap_atomic(it->it.page); + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); + if (ret < 0) + break; + } + if (vi->xattr_shared_count) + xattr_iter_end_final(&it->it); + + return ret < 0 ? ret : it->buffer_ofs; +} + +ssize_t erofs_listxattr(struct dentry *dentry, + char *buffer, size_t buffer_size) +{ + int ret; + struct listxattr_iter it; + + ret = init_inode_xattrs(d_inode(dentry)); + if (ret) + return ret; + + it.dentry = dentry; + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + it.it.sb = dentry->d_sb; + + ret = inline_listxattr(&it); + if (ret < 0 && ret != -ENOATTR) + return ret; + return shared_listxattr(&it); +} + diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h new file mode 100644 index 000000000000..0c7379282fc5 --- /dev/null +++ b/fs/erofs/xattr.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * linux/drivers/staging/erofs/xattr.h + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ +#ifndef __EROFS_XATTR_H +#define __EROFS_XATTR_H + +#include "internal.h" +#include +#include + +/* Attribute not found */ +#define ENOATTR ENODATA + +static inline unsigned inlinexattr_header_size(struct inode *inode) +{ + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * EROFS_V(inode)->xattr_shared_count; +} + +static inline erofs_blk_t +xattrblock_addr(struct erofs_sb_info *sbi, unsigned xattr_id) +{ +#ifdef CONFIG_EROFS_FS_XATTR + return sbi->xattr_blkaddr + + xattr_id * sizeof(__u32) / EROFS_BLKSIZ; +#else + return 0; +#endif +} + +static inline unsigned +xattrblock_offset(struct erofs_sb_info *sbi, unsigned xattr_id) +{ + return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; +} + +extern const struct xattr_handler erofs_xattr_user_handler; +extern const struct xattr_handler erofs_xattr_trusted_handler; +#ifdef CONFIG_EROFS_FS_SECURITY +extern const struct xattr_handler erofs_xattr_security_handler; +#endif + +static inline const struct xattr_handler *erofs_xattr_handler(unsigned index) +{ +static const struct xattr_handler *xattr_handler_map[] = { + [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &posix_acl_default_xattr_handler, +#endif + [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, +#endif +}; + return index && index < ARRAY_SIZE(xattr_handler_map) ? + xattr_handler_map[index] : NULL; +} + +#ifdef CONFIG_EROFS_FS_XATTR + +extern const struct inode_operations erofs_generic_xattr_iops; +extern const struct inode_operations erofs_dir_xattr_iops; + +int erofs_getxattr(struct inode *, int, const char *, void *, size_t); +ssize_t erofs_listxattr(struct dentry *, char *, size_t); +#else +static int __maybe_unused erofs_getxattr(struct inode *inode, int index, + const char *name, + void *buffer, size_t buffer_size) +{ + return -ENOTSUPP; +} + +static ssize_t __maybe_unused erofs_listxattr(struct dentry *dentry, + char *buffer, size_t buffer_size) +{ + return -ENOTSUPP; +} +#endif + +#endif +
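
Note for reviewers: a minimal user-space sketch follows for exercising the xattr paths implemented above; it is illustrative only and not part of the patch. The mount point /mnt/erofs and the file name are hypothetical placeholders for an already-mounted EROFS image. listxattr(2) is backed by erofs_listxattr() (inline entries first, then shared ones) and getxattr(2) reaches erofs_getxattr() through the generic xattr handlers.

	#include <stdio.h>
	#include <string.h>
	#include <sys/xattr.h>

	int main(void)
	{
		/* hypothetical mount point and file -- adjust to the image under test */
		const char *path = "/mnt/erofs/some_file";
		char list[1024], value[256];
		ssize_t n, vlen;
		size_t i;

		n = listxattr(path, list, sizeof(list));	/* served by erofs_listxattr() */
		if (n < 0) {
			perror("listxattr");
			return 1;
		}

		for (i = 0; i < (size_t)n; i += strlen(list + i) + 1) {
			/* served by erofs_getxattr() via the generic xattr handlers */
			vlen = getxattr(path, list + i, value, sizeof(value) - 1);
			if (vlen < 0)
				continue;
			value[vlen] = '\0';
			printf("%s=%s\n", list + i, value);
		}
		return 0;
	}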