mm: Backport ZRAM/ZSMALLOC from Google kernel

Change-Id: Ib07ead1e23e816c96552254c049016825a164f2c UPSTREAM: zram/zcomp: use GFP_NOIO to allocate streams (cherry picked from commit 3d5fe03a3ea013060ebba2a811aeb0f23f56aefa) We can end up allocating a new compression stream with GFP_KERNEL from within the IO path, which may result is nested (recursive) IO operations. That can introduce problems if the IO path in question is a reclaimer, holding some locks that will deadlock nested IOs. Allocate streams and working memory using GFP_NOIO flag, forbidding recursive IO and FS operations. An example: inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage. git/20158 [HC0[0]:SC0[0]:HE1:SE1] takes: (jbd2_handle){+.+.?.}, at: start_this_handle+0x4ca/0x555 {IN-RECLAIM_FS-W} state was registered at: __lock_acquire+0x8da/0x117b lock_acquire+0x10c/0x1a7 start_this_handle+0x52d/0x555 jbd2__journal_start+0xb4/0x237 __ext4_journal_start_sb+0x108/0x17e ext4_dirty_inode+0x32/0x61 __mark_inode_dirty+0x16b/0x60c iput+0x11e/0x274 __dentry_kill+0x148/0x1b8 shrink_dentry_list+0x274/0x44a prune_dcache_sb+0x4a/0x55 super_cache_scan+0xfc/0x176 shrink_slab.part.14.constprop.25+0x2a2/0x4d3 shrink_zone+0x74/0x140 kswapd+0x6b7/0x930 kthread+0x107/0x10f ret_from_fork+0x3f/0x70 irq event stamp: 138297 hardirqs last enabled at (138297): debug_check_no_locks_freed+0x113/0x12f hardirqs last disabled at (138296): debug_check_no_locks_freed+0x33/0x12f softirqs last enabled at (137818): __do_softirq+0x2d3/0x3e9 softirqs last disabled at (137813): irq_exit+0x41/0x95 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(jbd2_handle); <Interrupt> lock(jbd2_handle); *** DEADLOCK *** 5 locks held by git/20158: #0: (sb_writers#7){.+.+.+}, at: [<ffffffff81155411>] mnt_want_write+0x24/0x4b tarunkapadia93#1: (&type->i_mutex_dir_key#2/1){+.+.+.}, at: [<ffffffff81145087>] lock_rename+0xd9/0xe3 tarunkapadia93#2: (&sb->s_type->i_mutex_key#11){+.+.+.}, at: [<ffffffff8114f8e2>] lock_two_nondirectories+0x3f/0x6b armani-dev#3: (&sb->s_type->i_mutex_key#11/4){+.+.+.}, at: [<ffffffff8114f909>] lock_two_nondirectories+0x66/0x6b armani-dev#4: (jbd2_handle){+.+.?.}, at: [<ffffffff811e31db>] start_this_handle+0x4ca/0x555 stack backtrace: CPU: 2 PID: 20158 Comm: git Not tainted 4.1.0-rc7-next-20150615-dbg-00016-g8bdf555-dirty #211 Call Trace: dump_stack+0x4c/0x6e mark_lock+0x384/0x56d mark_held_locks+0x5f/0x76 lockdep_trace_alloc+0xb2/0xb5 kmem_cache_alloc_trace+0x32/0x1e2 zcomp_strm_alloc+0x25/0x73 [zram] zcomp_strm_multi_find+0xe7/0x173 [zram] zcomp_strm_find+0xc/0xe [zram] zram_bvec_rw+0x2ca/0x7e0 [zram] zram_make_request+0x1fa/0x301 [zram] generic_make_request+0x9c/0xdb submit_bio+0xf7/0x120 ext4_io_submit+0x2e/0x43 ext4_bio_write_page+0x1b7/0x300 mpage_submit_page+0x60/0x77 mpage_map_and_submit_buffers+0x10f/0x21d ext4_writepages+0xc8c/0xe1b do_writepages+0x23/0x2c __filemap_fdatawrite_range+0x84/0x8b filemap_flush+0x1c/0x1e ext4_alloc_da_blocks+0xb8/0x117 ext4_rename+0x132/0x6dc ? mark_held_locks+0x5f/0x76 ext4_rename2+0x29/0x2b vfs_rename+0x540/0x636 SyS_renameat2+0x359/0x44d SyS_rename+0x1e/0x20 entry_SYSCALL_64_fastpath+0x12/0x6f [[email protected]: add stable mark] Signed-off-by: Sergey Senozhatsky <[email protected]> Acked-by: Minchan Kim <[email protected]> Cc: Kyeongdon Kim <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]> UPSTREAM: zram: try vmalloc() after kmalloc() (cherry picked from commit d913897abace843bba20249f3190167f7895e9c3) When we're using LZ4 multi compression streams for zram swap, we found out page allocation failure message in system running test. That was not only once, but a few(2 - 5 times per test). Also, some failure cases were continually occurring to try allocation order 3. In order to make parallel compression private data, we should call kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order 2/3 size memory to allocate in that time, page allocation fails. This patch makes to use vmalloc() as fallback of kmalloc(), this prevents page alloc failure warning. After using this, we never found warning message in running test, also It could reduce process startup latency about 60-120ms in each case. For reference a call trace : Binder_1: page allocation failure: order:3, mode:0x10c0d0 CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20 Call trace: dump_backtrace+0x0/0x270 show_stack+0x10/0x1c dump_stack+0x1c/0x28 warn_alloc_failed+0xfc/0x11c __alloc_pages_nodemask+0x724/0x7f0 __get_free_pages+0x14/0x5c kmalloc_order_trace+0x38/0xd8 zcomp_lz4_create+0x2c/0x38 zcomp_strm_alloc+0x34/0x78 zcomp_strm_multi_find+0x124/0x1ec zcomp_strm_find+0xc/0x18 zram_bvec_rw+0x2fc/0x780 zram_make_request+0x25c/0x2d4 generic_make_request+0x80/0xbc submit_bio+0xa4/0x15c __swap_writepage+0x218/0x230 swap_writepage+0x3c/0x4c shrink_page_list+0x51c/0x8d0 shrink_inactive_list+0x3f8/0x60c shrink_lruvec+0x33c/0x4cc shrink_zone+0x3c/0x100 try_to_free_pages+0x2b8/0x54c __alloc_pages_nodemask+0x514/0x7f0 __get_free_pages+0x14/0x5c proc_info_read+0x50/0xe4 vfs_read+0xa0/0x12c SyS_read+0x44/0x74 DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB [[email protected]: change vmalloc gfp and adding comment about gfp] [[email protected]: tweak comments and styles] Signed-off-by: Kyeongdon Kim <[email protected]> Signed-off-by: Minchan Kim <[email protected]> Acked-by: Sergey Senozhatsky <[email protected]> Sergey Senozhatsky <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]> UPSTREAM: zram: pass gfp from zcomp frontend to backend (cherry picked from commit 75d8947a36d0c9aedd69118d1f14bf424005c7c2) Each zcomp backend uses own gfp flag but it's pointless because the context they could be called is driven by upper layer(ie, zcomp frontend). As well, zcomp frondend could call them in different context. One context(ie, zram init part) is it should be better to make sure successful allocation other context(ie, further stream allocation part for accelarating I/O speed) is just optional so let's pass gfp down from driver (ie, zcomp frontend) like normal MM convention. [[email protected]: add missing __vmalloc zero and highmem gfps] Signed-off-by: Minchan Kim <[email protected]> Signed-off-by: Sergey Senozhatsky <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]> UPSTREAM: zram/zcomp: do not zero out zcomp private pages (cherry picked from commit e02d238c9852a91b30da9ea32ce36d1416cdc683) Do not __GFP_ZERO allocated zcomp ->private pages. We keep allocated streams around and use them for read/write requests, so we supply a zeroed out ->private to compression algorithm as a scratch buffer only once -- the first time we use that stream. For the rest of IO requests served by this stream ->private usually contains some temporarily data from the previous requests. Signed-off-by: Sergey Senozhatsky <[email protected]> Acked-by: Minchan Kim <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]> UPSTREAM: block: disable entropy contributions for nonrot devices (cherry picked from commit b277da0a8a594308e17881f4926879bd5fca2a2d) Clear QUEUE_FLAG_ADD_RANDOM in all block drivers that set QUEUE_FLAG_NONROT. Historically, all block devices have automatically made entropy contributions. But as previously stated in commit e2e1a14 ("block: add sysfs knob for turning off disk entropy contributions"): - On SSD disks, the completion times aren't as random as they are for rotational drives. So it's questionable whether they should contribute to the random pool in the first place. - Calling add_disk_randomness() has a lot of overhead. There are more reliable sources for randomness than non-rotational block devices. From a security perspective it is better to err on the side of caution than to allow entropy contributions from unreliable "random" sources. Change-Id: I2a4f86bacee8786e2cb1a82d45156338f79d64e0 Signed-off-by: Mike Snitzer <[email protected]> Signed-off-by: Jens Axboe <[email protected]> Signed-off-by: hurtsky <[email protected]> Conflicts: drivers/block/zram/zram_drv.c drivers/staging/Kconfig drivers/staging/Makefile mm/Kconfig mm/Makefile Signed-off-by: hemantbeast <[email protected]>
hemantbeast · Dec 11, 2016 · 1795395 · 1795395
1 parent 23dfabd
commit 1795395
Show file tree

Hide file tree

Showing 8 changed files with 15 additions and 3 deletions.
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
@@ -557,4 +557,6 @@ config BLK_DEV_RBD
 
 	  If unsure, say N.
 
+source "drivers/block/zram/Kconfig"
+
 endif # BLK_DEV
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
@@ -3632,6 +3632,7 @@ static int mtip_block_initialize(struct driver_data *dd)
 
 	/* Set device limits. */
 	set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
+	clear_bit(QUEUE_FLAG_ADD_RANDOM, &dd->queue->queue_flags);
 	blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
 	blk_queue_physical_block_size(dd->queue, 4096);
 	blk_queue_io_min(dd->queue, 4096);

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
@@ -895,6 +895,7 @@ static int create_device(struct zram *zram, int device_id)
 	set_capacity(zram->disk, 0);
 	/* zram devices sort of resembles non-rotational disks */
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
+	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
 	/*
 	 * To ensure that we always get PAGE_SIZE aligned
 	 * and n*PAGE_SIZED sized I/O requests.

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
@@ -686,8 +686,10 @@ static void ide_disk_setup(ide_drive_t *drive)
 	printk(KERN_INFO "%s: max request size: %dKiB\n", drive->name,
 	       queue_max_sectors(q) / 2);
 
-	if (ata_id_is_ssd(id))
+	if (ata_id_is_ssd(id)) {
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
+	}
 
 	/* calculate drive capacity, and select LBA if possible */
 	ide_disk_get_capacity(drive);

diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
@@ -300,6 +300,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 
 	blk_queue_prep_rq(mq->queue, mmc_prep_request);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
+	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, mq->queue);
 	if (mmc_can_erase(card))
 		mmc_queue_setup_discard(mq->queue, card);
 

diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
@@ -436,6 +436,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
 	blk_queue_logical_block_size(new->rq, tr->blksize);
 
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, new->rq);
+	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, new->rq);
 
 	if (tr->discard) {
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, new->rq);

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
@@ -2384,8 +2384,10 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
 
 	rot = get_unaligned_be16(&buffer[4]);
 
-	if (rot == 1)
+	if (rot == 1) {
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, sdkp->disk->queue);
+		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, sdkp->disk->queue);
+	}
 
  out:
 	kfree(buffer);

diff --git a/mm/Kconfig b/mm/Kconfig
@@ -425,7 +425,7 @@ config ZPOOL
 	  zsmalloc.
 
 config ZSMALLOC
-	bool "Memory allocator for compressed pages"
+	tristate "Memory allocator for compressed pages"
 	depends on MMU
 	default n
 	help
@@ -445,3 +445,5 @@ config ZSMALLOC_STAT
 	  statistics about whats happening in zsmalloc and exports that
 	  information to userspace via debugfs.
 	  If unsure, say N.
+=======
+>>>>>>> a2fbcc7... mm: Backport ZRAM/ZSMALLOC from Google kernel
Original file line number	Diff line number	Diff line change
Expand Up		@@ -557,4 +557,6 @@ config BLK_DEV_RBD

		If unsure, say N.

		source "drivers/block/zram/Kconfig"

		endif # BLK_DEV