#! /bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (C) 2024 Luis Chamberlain. All Rights Reserved.
#
# FS QA Test No. 751
#
# stress page cache truncation + writeback
#
# This aims at trying to reproduce a difficult to reproduce bug found with
# min order. The issue was root caused to an xarray bug when we split folios
# to another order other than 0. This functionality is used to support min
# order. The crash:
#
# https://gist.github.com/mcgrof/d12f586ec6ebe32b2472b5d634c397df
# Crash excerpt is as follows:
#
# BUG: kernel NULL pointer dereference, address: 0000000000000036
# #PF: supervisor read access in kernel mode
# #PF: error_code(0x0000) - not-present page
# PGD 0 P4D 0
# Oops: 0000 [#1] PREEMPT SMP NOPTI
# CPU: 7 PID: 2190 Comm: kworker/u38:5 Not tainted 6.9.0-rc5+ #14
# Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
# Workqueue: writeback wb_workfn (flush-7:5)
# RIP: 0010:filemap_get_folios_tag+0xa9/0x200
# Call Trace:
#  <TASK>
#   writeback_iter+0x17d/0x310
#  write_cache_pages+0x42/0xa0
#  iomap_writepages+0x33/0x50
#  xfs_vm_writepages+0x63/0x90 [xfs]
#  do_writepages+0xcc/0x260
#  __writeback_single_inode+0x3d/0x340
#  writeback_sb_inodes+0x1ed/0x4b0
#  __writeback_inodes_wb+0x4c/0xe0
#  wb_writeback+0x267/0x2d0
#  wb_workfn+0x2a4/0x440
#  process_one_work+0x189/0x3b0
#  worker_thread+0x273/0x390
#  kthread+0xda/0x110
#  ret_from_fork+0x2d/0x50
#  ret_from_fork_asm+0x1a/0x30
#  </TASK>
#
# This may also find truncation bugs in the future, as truncating any
# mapped file through the collateral of using echo 1 > split_huge_pages will
# always respect the min order. Truncating to a larger order is then exercised
# when this test is run against any filesystem LBS profile or an LBS device.
#
# If you're enabling this and want to check underneath the hood you may want to
# enable:
#
# dyndbg='file mm/huge_memory.c +p'
#
# This test aims at increasing the rate of successful truncations so we want
# to increase the value of thp_split_page in $seqres.full. Using echo 1 >
# split_huge_pages is extremely aggressive, and even accounts for anonymous
# memory on a system, however we accept that tradeoff for the efficiency of
# doing the work in-kernel for any mapped file too. Our general goal here is to
# race with folio truncation + writeback.

# Source the shared test preamble. NOTE(review): it is expected to provide
# _begin_fstest and the $tmp / $seqres variables used below -- confirm in
# common/preamble.
. ./common/preamble

# Start the test. NOTE(review): the arguments look like the test groups this
# test is a member of -- confirm against _begin_fstest.
_begin_fstest auto long_rw stress soak smoketest

# Override the default cleanup function.
_cleanup()
{
	# leave whatever directory the test was working in
	cd /

	# temporary files of this test, plus the marker the background loop polls
	rm -f $tmp.* $runfile

	# force the background split loop to stop and reap it before returning;
	# any complaint from kill (e.g. the loop already exited) is discarded
	kill -KILL $split_huge_pages_files_pid &> /dev/null
	wait
}

# Files used by the fio run: the job file, fio's --output file and its stderr.
# They all live under $tmp.* so _cleanup's "rm -f $tmp.*" removes them.
fio_config=$tmp.fio
fio_out=$tmp.fio.out
fio_err=$tmp.fio.err

# Prerequisites. NOTE(review): the _require_* helpers are defined outside this
# file; presumably each one skips the test when its requirement is not met --
# confirm against their definitions.
_require_test
_require_scratch
_require_split_huge_pages_knob
# NOTE(review): presumably records the kernel commit that fixes the crash
# described in the header of this file.
_fixed_by_git_commit kernel 2a0774c2886d \
	"XArray: set the marks correctly when splitting an entry"

# Print the value of a single vmstat counter.
#
# $1 - exact counter name, e.g. thp_split_page
# $2 - optional file to read instead of /proc/vmstat
#
# Prints nothing when the counter is not present. The name is compared for
# equality rather than used as a regex, so asking for "thp_split_page" can
# never return "thp_split_page_failed" (or any other counter whose name merely
# contains the requested one), whatever order the lines come in.
proc_vmstat()
{
	local counter="$1"
	local vmstat_file="${2:-/proc/vmstat}"

	# stop at the first hit; counter names are expected to be unique
	awk -v name="$counter" '$1 == name { print $2; exit }' "$vmstat_file"
}

# we need buffered IO to force truncation races with writeback in the
# page cache
#
# fio only evaluates arithmetic in an option value when the expression is
# wrapped in parentheses (and then counts time in microseconds), so a bare
# "100*${TIME_FACTOR}" would not scale the runtime. Do the multiplication in
# the shell and hand fio a plain number of seconds instead.
fio_runtime=$((100 * ${TIME_FACTOR:-1}))

cat >$fio_config <<EOF
[force_large_large_folio_parallel_writes]
ignore_error=ENOSPC
nrfiles=10
direct=0
bs=4M
group_reporting=1
filesize=1GiB
readwrite=write
fallocate=none
numjobs=$(nproc)
directory=$SCRATCH_MNT
runtime=$fio_runtime
time_based
EOF

# NOTE(review): presumably skips the test when fio is missing or cannot handle
# this job file -- confirm against _require_fio.
_require_fio $fio_config

# NOTE(review): presumably the only line expected in the golden output.
echo "Silence is golden"

# NOTE(review): presumably creates and mounts a fresh scratch filesystem; the
# output of both helpers is appended to $seqres.full.
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount >> $seqres.full 2>&1

# used to let our loops know when to stop: the background loop keeps going
# for as long as this file exists
runfile="$tmp.keep.running.loop"
touch $runfile

# The background ops are out of bounds, the goal is to race with the fio
# writers started further down.

# Keep forcing folio splits for as long as the run marker exists; this seems
# to be screaming for MADV_NOHUGEPAGE for large folios.
(
	while test -e $runfile; do
		_split_huge_pages_all > /dev/null 2>&1
	done
) &
split_huge_pages_files_pid=$!

# Baseline for the split counters; they stay at 0 when /proc/vmstat has no
# thp_split_page entry.
split_count_before=0
split_count_failed_before=0

if ! grep -q thp_split_page /proc/vmstat; then
	echo "no thp_split_page in /proc/vmstat" >> $seqres.full
else
	split_count_before=$(proc_vmstat thp_split_page)
	split_count_failed_before=$(proc_vmstat thp_split_page_failed)
fi

# Record the job file in the full log, then blast away with large writes to
# force large folio writes when possible.
{
	echo -e "Running fio with config:\n"
	cat $fio_config
} >> $seqres.full

fio_alloc_size=$(( $(nproc) * 8192 ))
$FIO_PROG $fio_config --alloc-size=$fio_alloc_size --output=$fio_out \
	2> $fio_err
FIO_ERR=$?

# fio is done: drop the run marker so the background loop stops, then reap it
rm -f $runfile

wait &> /dev/null

# Log how far the split counters moved since the baseline was taken.
if grep -q thp_split_page /proc/vmstat; then
	split_count_after=$(proc_vmstat thp_split_page)
	split_count_failed_after=$(proc_vmstat thp_split_page_failed)

	thp_split_page=$((split_count_after - split_count_before))
	thp_split_page_failed=$((split_count_failed_after - split_count_failed_before))

	{
		echo "vmstat thp_split_page: $thp_split_page"
		echo "vmstat thp_split_page_failed: $thp_split_page_failed"
	} >> $seqres.full
fi

# exitall_on_error=ENOSPC does not work as it should, so we need this eyesore:
# a non-zero fio exit status is only a failure when fio's stderr does not
# mention running out of space.
if [[ $FIO_ERR -ne 0 ]] && ! grep -q "No space left on device" $fio_err; then
	# $fio_out and $fio_err live under $tmp.* and are removed by _cleanup,
	# so keep a copy in the full log before bailing out.
	echo "fio stderr:" >> $seqres.full
	cat $fio_err >> $seqres.full 2>/dev/null
	echo "fio output:" >> $seqres.full
	cat $fio_out >> $seqres.full 2>/dev/null
	_fail "fio failed with err: $FIO_ERR"
fi

# success
status=0
exit
