From cedcf08f43dab0bf27e7a4e9bc82f27ccf89241d Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Fri, 6 Sep 2024 13:57:42 +0800 Subject: [PATCH 01/71] ocfs2: remove unused declaration in header file

The definition of ocfs2_global_read_dquot() has been removed since commit fb8dd8d78014 ("ocfs2: Fix quota locking"). Let's remove the empty declaration.

Link: https://lkml.kernel.org/r/20240906055742.105024-1-zhangzekun11@huawei.com Signed-off-by: Zhang Zekun Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/quota.h | 1 - 1 file changed, 1 deletion(-)

diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index ebb5c99f490e..788a8de922a4 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -97,7 +97,6 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); int ocfs2_global_read_info(struct super_block *sb, int type); int ocfs2_global_write_info(struct super_block *sb, int type); -int ocfs2_global_read_dquot(struct dquot *dquot); int __ocfs2_sync_dquot(struct dquot *dquot, int freeing); static inline int ocfs2_sync_dquot(struct dquot *dquot) {

From 5c50b3b8cfefbe306f2b348eec0663b458d70221 Mon Sep 17 00:00:00 2001 From: Mohammed Anees Date: Tue, 17 Sep 2024 18:51:56 +0000 Subject: [PATCH 02/71] ocfs2: fix typo in comment

Fix "Allcate" -> "Allocate"

Link: https://lkml.kernel.org/r/20240917185156.10580-1-pvmohammedanees2003@gmail.com Signed-off-by: Mohammed Anees Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ea9127ba3208..395e23920632 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4767,7 +4767,7 @@ int ocfs2_insert_extent(handle_t *handle, } /* - * Allcate and add clusters into the extent b-tree. + * Allocate and add clusters into the extent b-tree. * The new clusters(clusters_to_add) will be inserted at logical_offset. * The extent b-tree's root is specified by et, and * it is not limited to the file storage. Any extent tree can use this

From 6efbd5ddb6af0408301b4c15b413e6425c7650b2 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Sat, 21 Sep 2024 16:07:45 +0530 Subject: [PATCH 03/71] kexec/crash: no crash update when kexec in progress

The following errors are observed when kexec is done with SMT=off on powerpc.

[ 358.458385] Removing IBM Power 842 compression device [ 374.795734] kexec_core: Starting new kernel [ 374.795748] kexec: Waking offline cpu 1. [ 374.875695] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate [ 374.935833] kexec: Waking offline cpu 2. [ 375.015664] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate snip.. [ 375.515823] kexec: Waking offline cpu 6. [ 375.635667] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate [ 375.695836] kexec: Waking offline cpu 7.

To avoid kexec kernel boot failure on PowerPC, all the present CPUs that are offline are brought online during kexec. For more information, refer to commit e8e5c2155b00 ("powerpc/kexec: Fix orphaned offline CPUs across kexec"). Bringing the CPUs online triggers the crash hotplug handler, crash_handle_hotplug_event(), to update the kdump image.
Since the system is on the kexec kernel boot path and the kexec lock is held, the crash_handle_hotplug_event() function fails to acquire the same lock to update the kdump image, resulting in the error messages mentioned above.

To fix this, return from crash_handle_hotplug_event() without printing the error message if kexec is in progress. The same applies to the crash_check_hotplug_support() function. Return 0 if kexec is in progress because the kernel is not in a position to update the kdump image.

Link: https://lkml.kernel.org/r/20240921103745.560430-1-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Acked-by: Baoquan He Reported-by: Sachin P Bappalige Cc: Hari Bathini Cc: Michael Ellerman Signed-off-by: Andrew Morton --- kernel/crash_core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c index c1048893f4b6..078fe5bc5a74 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -505,7 +505,8 @@ int crash_check_hotplug_support(void) crash_hotplug_lock(); /* Obtain lock while reading crash information */ if (!kexec_trylock()) { - pr_info("kexec_trylock() failed, kdump image may be inaccurate\n"); + if (!kexec_in_progress) + pr_info("kexec_trylock() failed, kdump image may be inaccurate\n"); crash_hotplug_unlock(); return 0; } @@ -547,7 +548,8 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, crash_hotplug_lock(); /* Obtain lock while changing crash information */ if (!kexec_trylock()) { - pr_info("kexec_trylock() failed, kdump image may be inaccurate\n"); + if (!kexec_in_progress) + pr_info("kexec_trylock() failed, kdump image may be inaccurate\n"); crash_hotplug_unlock(); return; }

From 838010180241f5a9779a9ef9a621cdd2842f7354 Mon Sep 17 00:00:00 2001 From: Tio Zhang Date: Fri, 6 Sep 2024 17:47:00 +0800 Subject: [PATCH 04/71] kernel/watchdog: always restore watchdog_softlockup(,hardlockup)_user_enabled after proc show MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Otherwise, when watchdog_enabled becomes 0, watchdog_softlockup(,hardlockup)_user_enabled will change to 0 after a proc show.
Steps to reproduce:

step 1: # cat /proc/sys/kernel/*watchdog
1
1
1

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 1
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_user_enabled            | 1
|----------------------------------|--------------------------

step 2: # echo 0 > /proc/sys/kernel/watchdog

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 0
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_user_enabled            | 0
|----------------------------------|--------------------------

step 3: # cat /proc/sys/kernel/*watchdog
0
0
0

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 0
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 0
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 0
|----------------------------------|--------------------------
| watchdog_user_enabled            | 0
|----------------------------------|--------------------------

step 4: # echo 1 > /proc/sys/kernel/watchdog

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 0
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 0
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 0
|----------------------------------|--------------------------
| watchdog_user_enabled            | 0
|----------------------------------|--------------------------

step 5: # cat /proc/sys/kernel/*watchdog
0
0
0

If we don't do "step 3" and do "step 4" right after "step 2", it will be

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 1
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_user_enabled            | 1
|----------------------------------|--------------------------

and then everything works correctly. So this patch fixes "step 3"'s values to

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 0
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_user_enabled            | 0
|----------------------------------|--------------------------

while still printing 0 as before.
Link: https://lkml.kernel.org/r/20240906094700.GA30052@didi-ThinkCentre-M930t-N000 Signed-off-by: Tio Zhang Reviewed-by: Douglas Anderson Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: John Ogness Cc: Juri Lelli Cc: Krister Johansen Cc: Li Zhe Cc: Luis Chamberlain Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Weißschuh Cc: Valentin Schneider Cc: Vincent Guittot Signed-off-by: Andrew Morton --- kernel/watchdog.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 262691ba62b7..6c91b6b72f51 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -990,6 +990,7 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr mutex_lock(&watchdog_mutex); + old = *param; if (!write) { /* * On read synchronize the userspace interface. This is a @@ -997,8 +998,8 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr */ *param = (watchdog_enabled & which) != 0; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + *param = old; } else { - old = READ_ONCE(*param); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && old != READ_ONCE(*param)) proc_watchdog_update(); From 5c1edea773c98707fbb23d1df168bcff52f61e4b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 25 Sep 2024 18:43:34 +0300 Subject: [PATCH 05/71] resource: replace open coded resource_intersection() Patch series "resource: A couple of cleanups". A couple of ad-hoc cleanups since there was a recent development of the code in question. No functional changes intended. This patch (of 2): __region_intersects() uses open coded resource_intersection(). Replace it with existing API which also make more clear what we are checking. Link: https://lkml.kernel.org/r/20240925154355.1170859-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20240925154355.1170859-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- kernel/resource.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 4101016e8b20..1c77ac239c7a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -537,17 +537,16 @@ static int __region_intersects(struct resource *parent, resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - resource_size_t ostart, oend; int type = 0; int other = 0; struct resource *p, *dp; + struct resource res, o; bool is_type, covered; - struct resource res; res.start = start; res.end = start + size - 1; for (p = parent->child; p ; p = p->sibling) { - if (!resource_overlaps(p, &res)) + if (!resource_intersection(p, &res, &o)) continue; is_type = (p->flags & flags) == flags && (desc == IORES_DESC_NONE || desc == p->desc); @@ -568,8 +567,6 @@ static int __region_intersects(struct resource *parent, resource_size_t start, * |-- "System RAM" --||-- "CXL Window 0a" --| */ covered = false; - ostart = max(res.start, p->start); - oend = min(res.end, p->end); for_each_resource(p, dp, false) { if (!resource_overlaps(dp, &res)) continue; @@ -578,17 +575,17 @@ static int __region_intersects(struct resource *parent, resource_size_t start, if (is_type) { type++; /* - * Range from 'ostart' to 'dp->start' + * Range from 'o.start' to 'dp->start' * isn't covered by matched resource. 
*/ - if (dp->start > ostart) + if (dp->start > o.start) break; - if (dp->end >= oend) { + if (dp->end >= o.end) { covered = true; break; } /* Remove covered range */ - ostart = max(ostart, dp->end + 1); + o.start = max(o.start, dp->end + 1); } } if (!covered) From ba1eccc114ffc62c4495a5e15659190fa2c42308 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 25 Sep 2024 18:43:35 +0300 Subject: [PATCH 06/71] resource: introduce is_type_match() helper and use it There are already a couple of places where we may replace a few lines of code by calling a helper, which increases readability while deduplicating the code. Introduce is_type_match() helper and use it. Link: https://lkml.kernel.org/r/20240925154355.1170859-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- kernel/resource.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 1c77ac239c7a..55bc09f50e21 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -297,6 +297,11 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); +static bool is_type_match(struct resource *p, unsigned long flags, unsigned long desc) +{ + return (p->flags & flags) == flags && (desc == IORES_DESC_NONE || desc == p->desc); +} + /** * find_next_iomem_res - Finds the lowest iomem resource that covers part of * [@start..@end]. @@ -339,13 +344,9 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p->end < start) continue; - if ((p->flags & flags) != flags) - continue; - if ((desc != IORES_DESC_NONE) && (desc != p->desc)) - continue; - /* Found a match, break */ - break; + if (is_type_match(p, flags, desc)) + break; } if (p) { @@ -540,7 +541,7 @@ static int __region_intersects(struct resource *parent, resource_size_t start, int type = 0; int other = 0; struct resource *p, *dp; struct resource res, o; - bool is_type, covered; + bool covered; res.start = start; res.end = start + size - 1; @@ -548,9 +549,7 @@ static int __region_intersects(struct resource *parent, resource_size_t start, for (p = parent->child; p ; p = p->sibling) { if (!resource_intersection(p, &res, &o)) continue; - is_type = (p->flags & flags) == flags && - (desc == IORES_DESC_NONE || desc == p->desc); - if (is_type) { + if (is_type_match(p, flags, desc)) { type++; continue; } @@ -570,9 +569,7 @@ static int __region_intersects(struct resource *parent, resource_size_t start, for_each_resource(p, dp, false) { if (!resource_overlaps(dp, &res)) continue; - is_type = (dp->flags & flags) == flags && - (desc == IORES_DESC_NONE || desc == dp->desc); - if (is_type) { + if (is_type_match(dp, flags, desc)) { type++; /* * Range from 'o.start' to 'dp->start' From 9357bf5e66660cf56da44905ff2b158056bc6087 Mon Sep 17 00:00:00 2001 From: Yu Jiaoliang Date: Thu, 26 Sep 2024 18:16:15 +0800 Subject: [PATCH 07/71] scripts/spelling.txt: add more spellings corrections Add several common typo patterns to the scripts/spelling.txt file to ensure checkpatch.pl can detect and prevent these typos in the future. This update helps improve code quality by preventing recurring typos. 
Link: https://lkml.kernel.org/r/20240926101617.3988613-1-yujiaoliang@vivo.com Signed-off-by: Yu Jiaoliang Signed-off-by: Shen Lichuan Signed-off-by: Yan Zhen Cc: Colin Ian King Cc: Jesse Brandeburg Signed-off-by: Andrew Morton --- scripts/spelling.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 554329a074ce..fe25bf3b605a 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -141,6 +141,7 @@ anomoly||anomaly anonynous||anonymous anway||anyway aplication||application +apeared||appeared appearence||appearance applicaion||application appliction||application @@ -155,6 +156,7 @@ apropriate||appropriate aquainted||acquainted aquired||acquired aquisition||acquisition +aquires||acquires arbitary||arbitrary architechture||architecture archtecture||architecture @@ -185,10 +187,12 @@ assotiated||associated asssert||assert assum||assume assumtpion||assumption +asume||assume asuming||assuming asycronous||asynchronous asychronous||asynchronous asynchnous||asynchronous +asynchrnous||asynchronous asynchronus||asynchronous asynchromous||asynchronous asymetric||asymmetric @@ -269,6 +273,7 @@ caculate||calculate caculation||calculation cadidate||candidate cahces||caches +calcluate||calculate calender||calendar calescing||coalescing calibraiton||calibration @@ -331,6 +336,7 @@ chouse||chose circumvernt||circumvent claread||cleared clared||cleared +clearify||clarify closeing||closing clustred||clustered cnfiguration||configuration @@ -379,12 +385,14 @@ comsumed||consumed comunicate||communicate comunication||communication conbination||combination +concurent||concurrent conditionaly||conditionally conditon||condition condtion||condition condtional||conditional conected||connected conector||connector +configed||configured configration||configuration configred||configured configuartion||configuration @@ -394,6 +402,7 @@ configuratoin||configuration configuraton||configuration configuretion||configuration configutation||configuration +congiuration||configuration conider||consider conjuction||conjunction connecetd||connected @@ -403,6 +412,7 @@ connnection||connection connnections||connections consistancy||consistency consistant||consistent +consits||consists containes||contains containts||contains contaisn||contains @@ -452,6 +462,7 @@ decendants||descendants decompres||decompress decsribed||described decription||description +detault||default dectected||detected defailt||default deferal||deferral @@ -487,6 +498,7 @@ depreacte||deprecate desactivate||deactivate desciptor||descriptor desciptors||descriptors +descritpor||descriptor descripto||descriptor descripton||description descrition||description @@ -601,6 +613,7 @@ enchanced||enhanced encorporating||incorporating encrupted||encrypted encrypiton||encryption +encryped||encrypted encryptio||encryption endianess||endianness enpoint||endpoint @@ -630,6 +643,7 @@ etsbalishment||establishment evalute||evaluate evalutes||evaluates evalution||evaluation +evaulated||evaluated excecutable||executable excceed||exceed exceded||exceeded @@ -650,6 +664,7 @@ exlcude||exclude exlcuding||excluding exlcusive||exclusive exlusive||exclusive +exlicitly||explicitly exmaple||example expecially||especially experies||expires @@ -834,6 +849,7 @@ informations||information informtion||information infromation||information ingore||ignore +inheritence||inheritance inital||initial initalized||initialized initalised||initialized @@ -878,6 +894,7 @@ interoprability||interoperability 
interuupt||interrupt interupt||interrupt interupts||interrupts +interurpt||interrupt interrface||interface interrrupt||interrupt interrup||interrupt @@ -925,6 +942,7 @@ jumpimng||jumping juse||just jus||just kown||known +lable||label langage||language langauage||language langauge||language @@ -995,6 +1013,7 @@ metdata||metadata micropone||microphone microprocesspr||microprocessor migrateable||migratable +miliseconds||milliseconds millenium||millennium milliseonds||milliseconds minimim||minimum @@ -1132,6 +1151,7 @@ palne||plane paramameters||parameters paramaters||parameters paramater||parameter +paramenters||parameters parametes||parameters parametised||parametrised paramter||parameter @@ -1177,9 +1197,11 @@ poiter||pointer posible||possible positon||position possibilites||possibilities +postion||position potocol||protocol powerfull||powerful pramater||parameter +preambule||preamble preamle||preamble preample||preamble preapre||prepare @@ -1269,6 +1291,7 @@ raoming||roaming reasearcher||researcher reasearchers||researchers reasearch||research +recalcualte||recalculate receieve||receive recepient||recipient recevied||received @@ -1291,6 +1314,7 @@ refcounf||refcount refence||reference refered||referred referenace||reference +refererence||reference refering||referring refernces||references refernnce||reference @@ -1315,12 +1339,14 @@ reloade||reload remoote||remote remore||remote removeable||removable +repective||respective repectively||respectively replacable||replaceable replacments||replacements replys||replies reponse||response representaion||representation +repsonse||response reqeust||request reqister||register requed||requeued @@ -1444,6 +1470,7 @@ soluation||solution souce||source speach||speech specfic||specific +specfication||specification specfield||specified speciefied||specified specifc||specific @@ -1544,6 +1571,7 @@ syncronus||synchronous syste||system sytem||system sythesis||synthesis +tagert||target taht||that tained||tainted tarffic||traffic @@ -1574,6 +1602,7 @@ tiggers||triggers tiggered||triggered tipically||typically timeing||timing +timming||timing timout||timeout tmis||this toogle||toggle @@ -1597,8 +1626,10 @@ transision||transition transistioned||transitioned transmittd||transmitted transormed||transformed +trasaction||transaction trasfer||transfer trasmission||transmission +trasmitter||transmitter treshold||threshold triggerd||triggered trigerred||triggered From f9a4d8930f272fd76edc1f76062b5ca316bda25d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 30 Sep 2024 21:58:22 +0200 Subject: [PATCH 08/71] ipc/msg: replace one-element array with flexible array member Replace the deprecated one-element array with a modern flexible array member in the struct compat_msgbuf. There are no binary differences after this conversion. 
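For illustration, the difference shows up at allocation time. A minimal sketch (the struct is the one from this patch; the kmalloc() call, `len`, and the use of the struct_size() helper from <linux/overflow.h> are illustrative assumptions, not part of the change):

	struct compat_msgbuf *msg;

	/* one allocation covering the header plus a len-byte text payload */
	msg = kmalloc(struct_size(msg, mtext, len), GFP_KERNEL);

With the old one-element array, sizeof(struct compat_msgbuf) already counted one mtext byte, inviting off-by-one sizing; a flexible array member contributes no size of its own and makes the variable-length intent explicit.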
Link: https://github.com/KSPP/linux/issues/79 Link: https://lkml.kernel.org/r/20240930195824.153648-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Cc: "Sun, Jiebin" Cc: Tim Chen Signed-off-by: Andrew Morton --- ipc/msg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ipc/msg.c b/ipc/msg.c index fd08b3cb36d7..ee6af4fe52bf 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -978,7 +978,7 @@ SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, struct compat_msgbuf { compat_long_t mtype; - char mtext[1]; + char mtext[]; }; long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp,

From 4cc0473d7754d387680bdf0728eb29f0ec8834bf Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 7 Oct 2024 22:49:05 +0800 Subject: [PATCH 09/71] get rid of __get_task_comm()

Patch series "Improve the copy of task comm", v8.

Using {memcpy,strncpy,strcpy,kstrdup} to copy the task comm relies on the length of task comm. Changes in the task comm could result in a destination string that overflows. Therefore, we should explicitly ensure the destination string is always NUL-terminated, regardless of the task comm. This approach will facilitate future extensions to the task comm.

As suggested by Linus [0], we can identify all relevant code with the following git grep commands:

git grep 'memcpy.*->comm\>'
git grep 'kstrdup.*->comm\>'
git grep 'strncpy.*->comm\>'
git grep 'strcpy.*->comm\>'

PATCH #2~#4: memcpy
PATCH #5~#6: kstrdup
PATCH #7: strcpy

Please note that strncpy() is not included in this series as it is being tracked by another effort. [1]

This patch (of 7):

We want to eliminate the use of __get_task_comm() for the following reasons:

- The task_lock() is unnecessary

Quoted from Linus [0]:

: Since user space can randomly change their names anyway, using locking
: was always wrong for readers (for writers it probably does make sense
: to have some lock - although practically speaking nobody cares there
: either, but at least for a writer some kind of race could have
: long-term mixed results

Link: https://lkml.kernel.org/r/20241007144911.27693-1-laoar.shao@gmail.com Link: https://lkml.kernel.org/r/20241007144911.27693-2-laoar.shao@gmail.com Link: https://lore.kernel.org/all/CAHk-=wivfrF0_zvf+oj6==Sh=-npJooP8chLPEfaFV0oNYTTBA@mail.gmail.com [0] Link: https://lore.kernel.org/all/CAHk-=whWtUC-AjmGJveAETKOMeMFSTwKwu99v7+b6AyHMmaDFA@mail.gmail.com/ Link: https://lore.kernel.org/all/CAHk-=wjAmmHUg6vho1KjzQi2=psR30+CogFd4aXrThr2gsiS4g@mail.gmail.com/ [0] Link: https://github.com/KSPP/linux/issues/90 [1] Signed-off-by: Yafang Shao Suggested-by: Linus Torvalds Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Eric Biederman Cc: Kees Cook Cc: Alexei Starovoitov Cc: Matus Jokay Cc: Alejandro Colomar Cc: "Serge E.
Hallyn" Cc: Catalin Marinas Cc: Justin Stitt Cc: Steven Rostedt (Google) Cc: Tetsuo Handa Cc: Andy Shevchenko Cc: Daniel Vetter Cc: David Airlie Cc: Eric Paris Cc: James Morris Cc: Maarten Lankhorst Cc: Matthew Wilcox Cc: Maxime Ripard Cc: Ondrej Mosnacek Cc: Paul Moore Cc: Quentin Monnet Cc: Simon Horman Cc: Stephen Smalley Cc: Thomas Zimmermann Signed-off-by: Andrew Morton --- fs/exec.c | 10 ---------- fs/proc/array.c | 2 +- include/linux/sched.h | 28 ++++++++++++++++++++++------ kernel/kthread.c | 2 +- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 6c53920795c2..77364806b48d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1189,16 +1189,6 @@ static int unshare_sighand(struct task_struct *me) return 0; } -char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) -{ - task_lock(tsk); - /* Always NUL terminated and zero-padded */ - strscpy_pad(buf, tsk->comm, buf_size); - task_unlock(tsk); - return buf; -} -EXPORT_SYMBOL_GPL(__get_task_comm); - /* * These functions flushes out all traces of the currently running executable * so that a new one can be started diff --git a/fs/proc/array.c b/fs/proc/array.c index 34a47fb0c57f..55ed3510d2bb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -109,7 +109,7 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) else if (p->flags & PF_KTHREAD) get_kthread_comm(tcomm, sizeof(tcomm), p); else - __get_task_comm(tcomm, sizeof(tcomm), p); + get_task_comm(tcomm, p); if (escape) seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); diff --git a/include/linux/sched.h b/include/linux/sched.h index bb343136ddd0..67718d5591dd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1121,9 +1121,12 @@ struct task_struct { /* * executable name, excluding path. * - * - normally initialized setup_new_exec() - * - access it with [gs]et_task_comm() - * - lock it with task_lock() + * - normally initialized begin_new_exec() + * - set it with set_task_comm() + * - strscpy_pad() to ensure it is always NUL-terminated and + * zero-padded + * - task_lock() to ensure the operation is atomic and the name is + * fully updated. */ char comm[TASK_COMM_LEN]; @@ -1938,10 +1941,23 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from) __set_task_comm(tsk, from, false); } -extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +/* + * - Why not use task_lock()? + * User space can randomly change their names anyway, so locking for readers + * doesn't make sense. For writers, locking is probably necessary, as a race + * condition could lead to long-term mixed results. + * The strscpy_pad() in __set_task_comm() can ensure that the task comm is + * always NUL-terminated and zero-padded. Therefore the race condition between + * reader and writer is not an issue. + * + * - BUILD_BUG_ON() can help prevent the buf from being truncated. + * Since the callers don't perform any return value checks, this safeguard is + * necessary. 
+ */ #define get_task_comm(buf, tsk) ({ \ - BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ - __get_task_comm(buf, sizeof(buf), tsk); \ + BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ + strscpy_pad(buf, (tsk)->comm); \ + buf; \ }) #ifdef CONFIG_SMP diff --git a/kernel/kthread.c b/kernel/kthread.c index 9bb36897b6c6..a5ac612b1609 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -101,7 +101,7 @@ void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { - __get_task_comm(buf, buf_size, tsk); + strscpy(buf, tsk->comm, buf_size); return; } From 286d7a54c8a2f124337a91235199585a35822d94 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 7 Oct 2024 22:49:06 +0800 Subject: [PATCH 10/71] auditsc: replace memcpy() with strscpy() Using strscpy() to read the task comm ensures that the name is always NUL-terminated, regardless of the source string. This approach also facilitates future extensions to the task comm. Link: https://lkml.kernel.org/r/20241007144911.27693-3-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Paul Moore Reviewed-by: Justin Stitt Cc: Eric Paris Cc: Alejandro Colomar Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andy Shevchenko Cc: Catalin Marinas Cc: Christian Brauner Cc: Daniel Vetter Cc: David Airlie Cc: Eric Biederman Cc: James Morris Cc: Jan Kara Cc: Kees Cook Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Matthew Wilcox Cc: Matus Jokay Cc: Maxime Ripard Cc: Ondrej Mosnacek Cc: Quentin Monnet Cc: "Serge E. Hallyn" Cc: Simon Horman Cc: Stephen Smalley Cc: Steven Rostedt (Google) Cc: Tetsuo Handa Cc: Thomas Zimmermann Signed-off-by: Andrew Morton --- kernel/auditsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cd57053b4a69..7adc67d5aafb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2730,7 +2730,7 @@ void __audit_ptrace(struct task_struct *t) context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); security_task_getsecid_obj(t, &context->target_sid); - memcpy(context->target_comm, t->comm, TASK_COMM_LEN); + strscpy(context->target_comm, t->comm); } /** @@ -2757,7 +2757,7 @@ int audit_signal_info_syscall(struct task_struct *t) ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); security_task_getsecid_obj(t, &ctx->target_sid); - memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); + strscpy(ctx->target_comm, t->comm); return 0; } @@ -2778,7 +2778,7 @@ int audit_signal_info_syscall(struct task_struct *t) axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]); - memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); + strscpy(axp->target_comm[axp->pid_count], t->comm); axp->pid_count++; return 0; From d4ee4ac395eec1e64f696dbea1de82e90b17127d Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 7 Oct 2024 22:49:07 +0800 Subject: [PATCH 11/71] security: replace memcpy() with get_task_comm() Quoted from Linus [0]: selinux never wanted a lock, and never wanted any kind of *consistent* result, it just wanted a *stable* result. Using get_task_comm() to read the task comm ensures that the name is always NUL-terminated, regardless of the source string. This approach also facilitates future extensions to the task comm. 
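The reader-side pattern the series converges on looks like this (a minimal sketch assuming ordinary kernel context; per the updated BUILD_BUG_ON() in the get_task_comm() definition shown above, the buffer must be at least TASK_COMM_LEN bytes):

	char comm[TASK_COMM_LEN];

	get_task_comm(comm, current);	/* strscpy_pad(): always NUL-terminated */
	pr_info("%s\n", comm);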
Link: https://lkml.kernel.org/r/20241007144911.27693-4-laoar.shao@gmail.com Signed-off-by: Yafang Shao Link: https://lore.kernel.org/all/CAHk-=wivfrF0_zvf+oj6==Sh=-npJooP8chLPEfaFV0oNYTTBA@mail.gmail.com/ [0] Acked-by: Paul Moore Cc: James Morris Cc: "Serge E. Hallyn" Cc: Stephen Smalley Cc: Ondrej Mosnacek Cc: Alejandro Colomar Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andy Shevchenko Cc: Catalin Marinas Cc: Christian Brauner Cc: Daniel Vetter Cc: David Airlie Cc: Eric Biederman Cc: Eric Paris Cc: Jan Kara Cc: Justin Stitt Cc: Kees Cook Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Matthew Wilcox Cc: Matus Jokay Cc: Maxime Ripard Cc: Quentin Monnet Cc: Simon Horman Cc: Steven Rostedt (Google) Cc: Tetsuo Handa Cc: Thomas Zimmermann Signed-off-by: Andrew Morton --- security/lsm_audit.c | 4 ++-- security/selinux/selinuxfs.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 849e832719e2..9a8352972086 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -207,7 +207,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); - audit_log_untrustedstring(ab, memcpy(comm, current->comm, sizeof(comm))); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); switch (a->type) { case LSM_AUDIT_DATA_NONE: @@ -302,7 +302,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, char comm[sizeof(tsk->comm)]; audit_log_format(ab, " opid=%d ocomm=", pid); audit_log_untrustedstring(ab, - memcpy(comm, tsk->comm, sizeof(comm))); + get_task_comm(comm, tsk)); } } break; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index e172f182b65c..c9b05be27ddb 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -708,7 +708,7 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, if (new_value) { char comm[sizeof(current->comm)]; - memcpy(comm, current->comm, sizeof(comm)); + strscpy(comm, current->comm); pr_err("SELinux: %s (%d) set checkreqprot to 1. This is no longer supported.\n", comm, current->pid); } From d967757d288182522ca263a3d4472101d15a2bfb Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 7 Oct 2024 22:49:08 +0800 Subject: [PATCH 12/71] bpftool: ensure task comm is always NUL-terminated Let's explicitly ensure the destination string is NUL-terminated. This way, it won't be affected by changes to the source string. Link: https://lkml.kernel.org/r/20241007144911.27693-5-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Quentin Monnet Cc: Alejandro Colomar Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andy Shevchenko Cc: Catalin Marinas Cc: Christian Brauner Cc: Daniel Vetter Cc: David Airlie Cc: Eric Biederman Cc: Eric Paris Cc: James Morris Cc: Jan Kara Cc: Justin Stitt Cc: Kees Cook Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Matthew Wilcox Cc: Matus Jokay Cc: Maxime Ripard Cc: Ondrej Mosnacek Cc: Paul Moore Cc: "Serge E. 
Hallyn" Cc: Simon Horman Cc: Stephen Smalley Cc: Steven Rostedt (Google) Cc: Tetsuo Handa Cc: Thomas Zimmermann Signed-off-by: Andrew Morton --- tools/bpf/bpftool/pids.c | 2 ++ 1 file changed, 2 insertions(+)

diff --git a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c index 9b898571b49e..23f488cf1740 100644 --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -54,6 +54,7 @@ static void add_ref(struct hashmap *map, struct pid_iter_entry *e) ref = &refs->refs[refs->ref_cnt]; ref->pid = e->pid; memcpy(ref->comm, e->comm, sizeof(ref->comm)); + ref->comm[sizeof(ref->comm) - 1] = '\0'; refs->ref_cnt++; return; @@ -77,6 +78,7 @@ static void add_ref(struct hashmap *map, struct pid_iter_entry *e) ref = &refs->refs[0]; ref->pid = e->pid; memcpy(ref->comm, e->comm, sizeof(ref->comm)); + ref->comm[sizeof(ref->comm) - 1] = '\0'; refs->ref_cnt = 1; refs->has_bpf_cookie = e->has_bpf_cookie; refs->bpf_cookie = e->bpf_cookie;

From 44ff630170edd89dcdca8a2552b1317fdcc65e51 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 7 Oct 2024 22:49:09 +0800 Subject: [PATCH 13/71] mm/util: fix possible race condition in kstrdup()

In kstrdup(), it is critical to ensure that the dest string is always NUL-terminated. However, a potential race condition can occur between a writer and a reader.

Consider the following scenario involving task->comm:

    reader                     writer

    len = strlen(s) + 1;
                               strlcpy(tsk->comm, buf, sizeof(tsk->comm));
    memcpy(buf, s, len);

In this case, there is a race condition between the reader and the writer. The reader calculates the length of the string `s` based on the old value of task->comm. However, during the memcpy(), the string `s` might be updated by the writer to a new value of task->comm. If the new task->comm is larger than the old one, the `buf` might not be NUL-terminated. This can lead to undefined behavior and potential security vulnerabilities.

Let's fix it by explicitly adding a NUL terminator after the memcpy. It is worth noting that memcpy() is not atomic, so the new string can be shorter when memcpy() has already copied past the new NUL.

Link: https://lkml.kernel.org/r/20241007144911.27693-6-laoar.shao@gmail.com Signed-off-by: Yafang Shao Cc: Alejandro Colomar Cc: Andy Shevchenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Catalin Marinas Cc: Christian Brauner Cc: Daniel Vetter Cc: David Airlie Cc: Eric Biederman Cc: Eric Paris Cc: James Morris Cc: Jan Kara Cc: Justin Stitt Cc: Kees Cook Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Matthew Wilcox Cc: Matus Jokay Cc: Maxime Ripard Cc: Ondrej Mosnacek Cc: Paul Moore Cc: Quentin Monnet Cc: "Serge E. Hallyn" Cc: Simon Horman Cc: Stephen Smalley Cc: Steven Rostedt (Google) Cc: Tetsuo Handa Cc: Thomas Zimmermann Signed-off-by: Andrew Morton --- mm/util.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mm/util.c b/mm/util.c index 4f1275023eb7..858a9a2f57e7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -62,8 +62,15 @@ char *kstrdup(const char *s, gfp_t gfp) len = strlen(s) + 1; buf = kmalloc_track_caller(len, gfp); - if (buf) + if (buf) { memcpy(buf, s, len); + /* + * During memcpy(), the string might be updated to a new value, + * which could be longer than the string when strlen() is + * called. Therefore, we need to add a NUL terminator.
+ */ + buf[len - 1] = '\0'; + } return buf; } EXPORT_SYMBOL(kstrdup); From 43731516facc3257b514e5c45ae80664b28d3ca3 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 7 Oct 2024 22:49:10 +0800 Subject: [PATCH 14/71] mm/util: deduplicate code in {kstrdup,kstrndup,kmemdup_nul} These three functions follow the same pattern. To deduplicate the code, let's introduce a common helper __kmemdup_nul(). Link: https://lkml.kernel.org/r/20241007144911.27693-7-laoar.shao@gmail.com Suggested-by: Andrew Morton Signed-off-by: Yafang Shao Cc: Simon Horman Cc: Matthew Wilcox Cc: Alejandro Colomar Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andy Shevchenko Cc: Catalin Marinas Cc: Christian Brauner Cc: Daniel Vetter Cc: David Airlie Cc: Eric Biederman Cc: Eric Paris Cc: James Morris Cc: Jan Kara Cc: Justin Stitt Cc: Kees Cook Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Matus Jokay Cc: Maxime Ripard Cc: Ondrej Mosnacek Cc: Paul Moore Cc: Quentin Monnet Cc: "Serge E. Hallyn" Cc: Stephen Smalley Cc: Steven Rostedt (Google) Cc: Tetsuo Handa Cc: Thomas Zimmermann Signed-off-by: Andrew Morton --- mm/util.c | 69 ++++++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 42 deletions(-) diff --git a/mm/util.c b/mm/util.c index 858a9a2f57e7..c7d851c40843 100644 --- a/mm/util.c +++ b/mm/util.c @@ -44,6 +44,30 @@ void kfree_const(const void *x) } EXPORT_SYMBOL(kfree_const); +/** + * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated. + * @s: The data to copy + * @len: The size of the data, not including the NUL terminator + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s with NUL-termination or %NULL in + * case of error + */ +static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp) +{ + char *buf; + + /* '+1' for the NUL terminator */ + buf = kmalloc_track_caller(len + 1, gfp); + if (!buf) + return NULL; + + memcpy(buf, s, len); + /* Ensure the buf is always NUL-terminated, regardless of @s. */ + buf[len] = '\0'; + return buf; +} + /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -54,24 +78,7 @@ EXPORT_SYMBOL(kfree_const); noinline char *kstrdup(const char *s, gfp_t gfp) { - size_t len; - char *buf; - - if (!s) - return NULL; - - len = strlen(s) + 1; - buf = kmalloc_track_caller(len, gfp); - if (buf) { - memcpy(buf, s, len); - /* - * During memcpy(), the string might be updated to a new value, - * which could be longer than the string when strlen() is - * called. Therefore, we need to add a NUL terminator. - */ - buf[len - 1] = '\0'; - } - return buf; + return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL; } EXPORT_SYMBOL(kstrdup); @@ -107,19 +114,7 @@ EXPORT_SYMBOL(kstrdup_const); */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { - size_t len; - char *buf; - - if (!s) - return NULL; - - len = strnlen(s, max); - buf = kmalloc_track_caller(len+1, gfp); - if (buf) { - memcpy(buf, s, len); - buf[len] = '\0'; - } - return buf; + return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL; } EXPORT_SYMBOL(kstrndup); @@ -193,17 +188,7 @@ EXPORT_SYMBOL(kvmemdup); */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { - char *buf; - - if (!s) - return NULL; - - buf = kmalloc_track_caller(len + 1, gfp); - if (buf) { - memcpy(buf, s, len); - buf[len] = '\0'; - } - return buf; + return s ? 
__kmemdup_nul(s, len, gfp) : NULL; } EXPORT_SYMBOL(kmemdup_nul); From 3240aadaccc15d781d1669965ccad230a8c4a175 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 7 Oct 2024 22:49:11 +0800 Subject: [PATCH 15/71] drm: replace strcpy() with strscpy() To prevent errors from occurring when the src string is longer than the dst string in strcpy(), we should use strscpy() instead. This approach also facilitates future extensions to the task comm. Link: https://lkml.kernel.org/r/20241007144911.27693-8-laoar.shao@gmail.com Signed-off-by: Yafang Shao Suggested-by: Justin Stitt Acked-by: Daniel Vetter Reviewed-by: Justin Stitt Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Alejandro Colomar Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andy Shevchenko Cc: Catalin Marinas Cc: Christian Brauner Cc: Eric Biederman Cc: Eric Paris Cc: James Morris Cc: Jan Kara Cc: Kees Cook Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Matus Jokay Cc: Ondrej Mosnacek Cc: Paul Moore Cc: Quentin Monnet Cc: "Serge E. Hallyn" Cc: Simon Horman Cc: Stephen Smalley Cc: Steven Rostedt (Google) Cc: Tetsuo Handa Signed-off-by: Andrew Morton --- drivers/gpu/drm/drm_framebuffer.c | 2 +- drivers/gpu/drm/i915/i915_gpu_error.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/drm_framebuffer.c b/drivers/gpu/drm/drm_framebuffer.c index 888aadb6a4ac..2d6993539474 100644 --- a/drivers/gpu/drm/drm_framebuffer.c +++ b/drivers/gpu/drm/drm_framebuffer.c @@ -868,7 +868,7 @@ int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb, INIT_LIST_HEAD(&fb->filp_head); fb->funcs = funcs; - strcpy(fb->comm, current->comm); + strscpy(fb->comm, current->comm); ret = __drm_mode_object_add(dev, &fb->base, DRM_MODE_OBJECT_FB, false, drm_framebuffer_free); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 6469b9bcf2ec..9d4b25b2cd39 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1113,7 +1113,7 @@ i915_vma_coredump_create(const struct intel_gt *gt, } INIT_LIST_HEAD(&dst->page_list); - strcpy(dst->name, name); + strscpy(dst->name, name); dst->next = NULL; dst->gtt_offset = vma_res->start; @@ -1413,7 +1413,7 @@ static bool record_context(struct i915_gem_context_coredump *e, rcu_read_lock(); task = pid_task(ctx->pid, PIDTYPE_PID); if (task) { - strcpy(e->comm, task->comm); + strscpy(e->comm, task->comm); e->pid = task->pid; } rcu_read_unlock(); @@ -1459,7 +1459,7 @@ capture_vma_snapshot(struct intel_engine_capture_vma *next, return next; } - strcpy(c->name, name); + strscpy(c->name, name); c->vma_res = i915_vma_resource_get(vma_res); c->next = next; From b42166427b46af0d963242283fc99d429623d303 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 6 Oct 2024 06:22:21 +0800 Subject: [PATCH 16/71] lib/Kconfig.debug: move int_pow test option to runtime testing section When executing 'make menuconfig' with KUNIT enabled, the int_pow test option appears on the first page of the main menu instead of under the runtime testing section. Relocate the int_pow test configuration to the appropriate runtime testing submenu, ensuring a more organized and logical structure in the menu configuration. 
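For context, the function under test is u64 int_pow(u64 base, unsigned int exp) from lib/math/int_pow.c; a trivial illustrative use:

	u64 kib = int_pow(2, 10);	/* 2 raised to the 10th power: 1024 */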
Link: https://lkml.kernel.org/r/20241005222221.2154393-1-visitorckw@gmail.com Fixes: 7fcc9b53216c ("lib/math: Add int_pow test suite") Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: David Gow Cc: Luis Felipe Hernandez Cc: Shuah Khan Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7312ae7c3cc5..409dd193c09b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2993,6 +2993,22 @@ config TEST_OBJPOOL If unsure, say N. +config INT_POW_TEST + tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the int_pow function, + which performs integer exponentiation. The test suite is designed to + verify that the implementation of int_pow correctly computes the power + of a given base raised to a given exponent. + + Enabling this option will include tests that check various scenarios + and edge cases to ensure the accuracy and reliability of the exponentiation + function. + + If unsure, say N + endif # RUNTIME_TESTING_MENU config ARCH_USE_MEMTEST @@ -3088,19 +3104,3 @@ config RUST_KERNEL_DOCTESTS endmenu # "Rust" endmenu # Kernel hacking - -config INT_POW_TEST - tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS - depends on KUNIT - default KUNIT_ALL_TESTS - help - This option enables the KUnit test suite for the int_pow function, - which performs integer exponentiation. The test suite is designed to - verify that the implementation of int_pow correctly computes the power - of a given base raised to a given exponent. - - Enabling this option will include tests that check various scenarios - and edge cases to ensure the accuracy and reliability of the exponentiation - function. - - If unsure, say N

From 5a3c9366cbbf876521f570ce1fb525dc2cb0ed5c Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Tue, 8 Oct 2024 14:52:53 +0800 Subject: [PATCH 17/71] list: test: check the size of every list for list_cut_position*()

Check that the total number of elements in both resultant lists is correct within list_cut_position*(). Previously, only the first list's size was checked, so additional elements in the second list would not have been caught.
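For context, list_cut_position(&dst, &src, entry) moves the initial run of src, up to and including entry, onto dst. A rough sketch of what the strengthened test asserts (entries[] mirrors the test's array; the list population step is elided):

	struct list_head entries[5], src, dst;

	/* ... init src/dst and add entries[0..4] to src in order ... */
	list_cut_position(&dst, &src, &entries[2]);
	/* dst now holds entries[0..2] (3 nodes), src holds entries[3..4];
	 * the element counts of *both* resulting lists are now verified */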
Link: https://lkml.kernel.org/r/20241008065253.26673-1-richard120310@gmail.com Signed-off-by: I Hsin Cheng Cc: David Gow Signed-off-by: Andrew Morton --- lib/list-test.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/list-test.c b/lib/list-test.c index e207c4c98d70..9135cdc1bb39 100644 --- a/lib/list-test.c +++ b/lib/list-test.c @@ -412,6 +412,8 @@ static void list_test_list_cut_position(struct kunit *test) KUNIT_EXPECT_PTR_EQ(test, cur, &entries[i]); i++; } + + KUNIT_EXPECT_EQ(test, i, 3); } static void list_test_list_cut_before(struct kunit *test) @@ -440,6 +442,8 @@ static void list_test_list_cut_before(struct kunit *test) KUNIT_EXPECT_PTR_EQ(test, cur, &entries[i]); i++; } + + KUNIT_EXPECT_EQ(test, i, 3); } static void list_test_list_splice(struct kunit *test) From 834b251b1db6b88b9364955196e5e32746e5ccc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 9 Oct 2024 15:57:51 +0300 Subject: [PATCH 18/71] resource: correct reallocate_resource() documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit reallocate_resource() documentation claims constraint is about "the size and alignment" but the size is provided in another parameter. Instead of size, constraint has the allowed memory range (min, max) so change the wording to reflect that. Link: https://lkml.kernel.org/r/20241009125751.8090-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Andrew Morton --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/resource.c b/kernel/resource.c index 55bc09f50e21..2d4208b2f62f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(find_resource_space); * @root: root resource descriptor * @old: resource descriptor desired by caller * @newsize: new size of the resource descriptor - * @constraint: the size and alignment constraints to be met. + * @constraint: the memory range and alignment constraints to be met. */ static int reallocate_resource(struct resource *root, struct resource *old, resource_size_t newsize, From f2fa0fd4e7db8326a77618962714924b64f5f889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 12 Oct 2024 19:52:53 +0200 Subject: [PATCH 19/71] reboot: move reboot_notifier_list to kernel/reboot.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All the functions related to the reboot notifier list are in kernel/reboot.c. Move the list itself, too. As there are no direct users anymore, make the declaration static. 
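Making the list head static works because code outside kernel/reboot.c never touches it directly; everything goes through the registration API exported from kernel/reboot.c. A minimal illustrative user (the callback and notifier_block names here are hypothetical):

	static int my_reboot_cb(struct notifier_block *nb, unsigned long action,
				void *data)
	{
		/* quiesce DMA, stop hardware, etc. */
		return NOTIFY_DONE;
	}

	static struct notifier_block my_reboot_nb = {
		.notifier_call = my_reboot_cb,
	};

	/* in driver init: */
	register_reboot_notifier(&my_reboot_nb);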
Link: https://lkml.kernel.org/r/20241012-reboot_notifier_list-v1-1-6093bb9455ce@weissschuh.net Signed-off-by: Thomas Weißschuh Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- include/linux/notifier.h | 2 -- kernel/notifier.c | 8 -------- kernel/reboot.c | 7 +++++++ 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 45702bdcbceb..b42e64734968 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -237,7 +237,5 @@ static inline int notifier_to_errno(int ret) #define KBD_KEYSYM 0x0004 /* Keyboard keysym */ #define KBD_POST_KEYSYM 0x0005 /* Called after keyboard keysym interpretation */ -extern struct blocking_notifier_head reboot_notifier_list; - #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff --git a/kernel/notifier.c b/kernel/notifier.c index b3ce28f39eb6..2f9fe7c30287 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -5,18 +5,10 @@ #include #include #include -#include <linux/reboot.h> #define CREATE_TRACE_POINTS #include <trace/events/notifier.h> -/* - * Notifier list for kernel code which wants to be called - * at shutdown. This is used to stop any idling DMA operations - * and the like. - */ -BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); - /* * Notifier chain core routines. The exported routines below * are layered on top of these, with appropriate locking added. diff --git a/kernel/reboot.c b/kernel/reboot.c index f05dbde2c93f..ffdf86b717ab 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -72,6 +72,13 @@ static bool poweroff_fallback_to_halt; */ void __weak (*pm_power_off)(void); +/* + * Notifier list for kernel code which wants to be called + * at shutdown. This is used to stop any idling DMA operations + * and the like. + */ +static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); + /** * emergency_restart - reboot the system *

From a9d38bcd7337f051912174ebfc500e1cef73982e Mon Sep 17 00:00:00 2001 From: Sui Jingfeng Date: Sat, 12 Oct 2024 18:08:17 +0800 Subject: [PATCH 20/71] scatterlist: fix a typo

Replace the 'One' with 'On'.

Link: https://lkml.kernel.org/r/20241012100817.323007-1-sui.jingfeng@linux.dev Fixes: af2880ec4402 ("scatterlist: add dedicated config for DMA flags") Signed-off-by: Sui Jingfeng Reviewed-by: Petr Tesarik Cc: Catalin Marinas Cc: Michael Kelley Cc: Robin Murphy Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index e61d164622db..c5e2239b550e 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -273,7 +273,7 @@ static inline void sg_unmark_end(struct scatterlist *sg) } /* - * One 64-bit architectures there is a 4-byte padding in struct scatterlist + * On 64-bit architectures there is a 4-byte padding in struct scatterlist * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA * flags bits to indicate when a specific dma address is a bus address or the * buffer may have been bounced via SWIOTLB.

From 5d042707089f0d0c49473d05250e4f319a71e1df Mon Sep 17 00:00:00 2001 From: Vinicius Peixoto Date: Sat, 12 Oct 2024 04:43:49 -0300 Subject: [PATCH 21/71] lib/crc16_kunit.c: add KUnit tests for crc16

Add KUnit tests for the kernel's implementation of the standard CRC-16 algorithm (<linux/crc16.h>). The test data consists of 100 randomly-generated test cases, validated against a naive CRC-16 implementation. This test follows roughly the same logic as lib/crc32test.c, but without the performance measurements.
Link: https://lkml.kernel.org/r/20241012-crc16-kunit-v3-1-0ca75cb58ca9@lkcamp.dev Signed-off-by: Vinicius Peixoto Co-developed-by: Enzo Bertoloti Signed-off-by: Enzo Bertoloti Co-developed-by: Fabricio Gasperin Signed-off-by: Fabricio Gasperin Suggested-by: David Laight Cc: Brendan Higgins Cc: David Gow Cc: Rae Moar Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 9 +++ lib/Makefile | 1 + lib/crc16_kunit.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+) create mode 100644 lib/crc16_kunit.c

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 409dd193c09b..eda319e9d569 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2850,6 +2850,15 @@ config USERCOPY_KUNIT_TEST on the copy_to/from_user infrastructure, making sure basic user/kernel boundary testing is working. +config CRC16_KUNIT_TEST + tristate "KUnit tests for CRC16" + depends on KUNIT + default KUNIT_ALL_TESTS + select CRC16 + help + Enable this option to run unit tests for the kernel's CRC16 + implementation (<linux/crc16.h>). + config TEST_UDELAY tristate "udelay test driver" help diff --git a/lib/Makefile b/lib/Makefile index 773adf88af41..1faed6414a85 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -389,6 +389,7 @@ CFLAGS_fortify_kunit.o += $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o +obj-$(CONFIG_CRC16_KUNIT_TEST) += crc16_kunit.o obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o diff --git a/lib/crc16_kunit.c b/lib/crc16_kunit.c new file mode 100644 index 000000000000..0918c98a96d2 --- /dev/null +++ b/lib/crc16_kunit.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit tests for CRC16.
+ * + * Copyright (C) 2024, LKCAMP + * Author: Vinicius Peixoto + * Author: Fabricio Gasperin + * Author: Enzo Bertoloti + */ +#include <kunit/test.h> +#include <linux/crc16.h> +#include <linux/prandom.h> + +#define CRC16_KUNIT_DATA_SIZE 4096 +#define CRC16_KUNIT_TEST_SIZE 100 +#define CRC16_KUNIT_SEED 0x12345678 + +/** + * struct crc16_test - CRC16 test data + * @crc: initial input value to CRC16 + * @start: Start index within the data buffer + * @length: Length of the data + */ +static struct crc16_test { + u16 crc; + u16 start; + u16 length; +} tests[CRC16_KUNIT_TEST_SIZE]; + +u8 data[CRC16_KUNIT_DATA_SIZE]; + + +/* Naive implementation of CRC16 for validation purposes */ +static inline u16 _crc16_naive_byte(u16 crc, u8 data) +{ + u8 i = 0; + + crc ^= (u16) data; + for (i = 0; i < 8; i++) { + if (crc & 0x01) + crc = (crc >> 1) ^ 0xa001; + else + crc = crc >> 1; + } + + return crc; +} + + +static inline u16 _crc16_naive(u16 crc, u8 *buffer, size_t len) +{ + while (len--) + crc = _crc16_naive_byte(crc, *buffer++); + return crc; +} + + +/* Small helper for generating pseudorandom 16-bit data */ +static inline u16 _rand16(void) +{ + static u32 rand = CRC16_KUNIT_SEED; + + rand = next_pseudo_random32(rand); + return rand & 0xFFFF; +} + + +static int crc16_init_test_data(struct kunit_suite *suite) +{ + size_t i; + + /* Fill the data buffer with random bytes */ + for (i = 0; i < CRC16_KUNIT_DATA_SIZE; i++) + data[i] = _rand16() & 0xFF; + + /* Generate random test data while ensuring the random + * start + length values won't overflow the 4096-byte + * buffer (0x7FF * 2 = 0xFFE < 0x1000) + */ + for (size_t i = 0; i < CRC16_KUNIT_TEST_SIZE; i++) { + tests[i].crc = _rand16(); + tests[i].start = _rand16() & 0x7FF; + tests[i].length = _rand16() & 0x7FF; + } + + return 0; +} + +static void crc16_test_empty(struct kunit *test) +{ + u16 crc; + + /* The result for empty data should be the same as the + * initial crc + */ + crc = crc16(0x00, data, 0); + KUNIT_EXPECT_EQ(test, crc, 0); + crc = crc16(0xFF, data, 0); + KUNIT_EXPECT_EQ(test, crc, 0xFF); +} + +static void crc16_test_correctness(struct kunit *test) +{ + size_t i; + u16 crc, crc_naive; + + for (i = 0; i < CRC16_KUNIT_TEST_SIZE; i++) { + /* Compare results with the naive crc16 implementation */ + crc = crc16(tests[i].crc, data + tests[i].start, + tests[i].length); + crc_naive = _crc16_naive(tests[i].crc, data + tests[i].start, + tests[i].length); + KUNIT_EXPECT_EQ(test, crc, crc_naive); + } +} + + +static void crc16_test_combine(struct kunit *test) +{ + size_t i, j; + u16 crc, crc_naive; + + /* Make sure that combining two consecutive crc16 calculations + * yields the same result as calculating the crc16 for the whole thing + */ + for (i = 0; i < CRC16_KUNIT_TEST_SIZE; i++) { + crc_naive = crc16(tests[i].crc, data + tests[i].start, tests[i].length); + for (j = 0; j < tests[i].length; j++) { + crc = crc16(tests[i].crc, data + tests[i].start, j); + crc = crc16(crc, data + tests[i].start + j, tests[i].length - j); + KUNIT_EXPECT_EQ(test, crc, crc_naive); + } + } +} + + +static struct kunit_case crc16_test_cases[] = { + KUNIT_CASE(crc16_test_empty), + KUNIT_CASE(crc16_test_combine), + KUNIT_CASE(crc16_test_correctness), + {}, +}; + +static struct kunit_suite crc16_test_suite = { + .name = "crc16", + .test_cases = crc16_test_cases, + .suite_init = crc16_init_test_data, +}; +kunit_test_suite(crc16_test_suite); + +MODULE_AUTHOR("Fabricio Gasperin "); +MODULE_AUTHOR("Vinicius Peixoto "); +MODULE_AUTHOR("Enzo Bertoloti "); +MODULE_DESCRIPTION("Unit tests for crc16"); +MODULE_LICENSE("GPL");

From
8801c35c3672c8492824f5d3c4d3b37f43ed63c3 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 11 Oct 2024 16:51:55 -0600 Subject: [PATCH 22/71] tools: fix -Wunused-result in linux.c Fix the following -Wunused-result warnings on posix_memalign() return values and add error handling. ./shared/linux.c:100:25: warning: ignoring return value of `posix_memalign' declared with attribute `warn_unused_result' [-Wunused-result] 100 | posix_memalign(&p, cachep->align, cachep->size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../shared/linux.c: In function `kmem_cache_alloc_bulk': ../shared/linux.c:198:33: warning: ignoring return value of `posix_memalign' declared with attribute `warn_unused_result' [-Wunused-result] 198 | posix_memalign(&p[i], cachep->align, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 199 | cachep->size); | ~~~~~~~~~~~~~ Link: https://lkml.kernel.org/r/20241011225155.27607-1-skhan@linuxfoundation.org Signed-off-by: Shuah Khan Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Sidhartha Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/shared/linux.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 17263696b5d8..66dbb362385f 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -96,10 +96,13 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, p = node; } else { pthread_mutex_unlock(&cachep->lock); - if (cachep->align) - posix_memalign(&p, cachep->align, cachep->size); - else + if (cachep->align) { + if (posix_memalign(&p, cachep->align, cachep->size) < 0) + return NULL; + } else { p = malloc(cachep->size); + } + if (cachep->ctor) cachep->ctor(p); else if (gfp & __GFP_ZERO) @@ -195,8 +198,9 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } if (cachep->align) { - posix_memalign(&p[i], cachep->align, - cachep->size); + if (posix_memalign(&p[i], cachep->align, + cachep->size) < 0) + break; } else { p[i] = malloc(cachep->size); if (!p[i]) From bf9850f6ea3577a099b0ed43f6e19ca43ef08704 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 11 Oct 2024 22:12:14 +0800 Subject: [PATCH 23/71] lib/Makefile: make union-find compilation conditional on CONFIG_CPUSETS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, cpuset is the only user of the union-find implementation. Compiling union-find in all configurations unnecessarily increases the code size when building the kernel without cgroup support. Modify the build system to compile union-find only when CONFIG_CPUSETS is enabled. 
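For context, a minimal sketch of the union-find interface that CONFIG_CPUSETS now selects; the node names and the merge-and-query helper are illustrative, assuming the uf_node/uf_find()/uf_union() API of include/linux/union_find.h:

#include <linux/union_find.h>

/* Two singleton sets; UF_INIT_NODE makes each node its own parent. */
static struct uf_node a = UF_INIT_NODE(a);
static struct uf_node b = UF_INIT_NODE(b);

static bool same_set(void)
{
	uf_union(&a, &b);			/* merge the two sets */
	return uf_find(&a) == uf_find(&b);	/* both roots now compare equal */
}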
Link: https://lore.kernel.org/lkml/1ccd6411-5002-4574-bb8e-3e64bba6a757@redhat.com/
Link: https://lkml.kernel.org/r/20241011141214.87096-1-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu
Suggested-by: Waiman Long
Acked-by: Waiman Long
Acked-by: Tejun Heo
Reviewed-by: Christoph Hellwig
Cc: Ching-Chun (Jim) Huang
Cc: Johannes Weiner
Cc: Michal Koutný
Cc: Xavier
Cc: Zefan Li
Signed-off-by: Andrew Morton
---
 init/Kconfig | 1 +
 lib/Kconfig  | 3 +++
 lib/Makefile | 3 ++-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/init/Kconfig b/init/Kconfig
index c521e1421ad4..5cb9b9490f30 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1157,6 +1157,7 @@ config CGROUP_HUGETLB
 config CPUSETS
 	bool "Cpuset controller"
 	depends on SMP
+	select UNION_FIND
 	help
 	  This option will let you create and manage CPUSETs which
 	  allow dynamically partitioning a system into sets of CPUs and

diff --git a/lib/Kconfig b/lib/Kconfig
index b38849af6f13..cf303bd91dda 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -777,3 +777,6 @@ config POLYNOMIAL
 config FIRMWARE_TABLE
 	bool
+
+config UNION_FIND
+	bool

diff --git a/lib/Makefile b/lib/Makefile
index 1faed6414a85..feebed74fc7a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -35,8 +35,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
 	 nmi_backtrace.o win_minmax.o memcat_p.o \
-	 buildid.o objpool.o union_find.o
+	 buildid.o objpool.o

+lib-$(CONFIG_UNION_FIND) += union_find.o
 lib-$(CONFIG_PRINTK) += dump_stack.o
 lib-$(CONFIG_SMP) += cpumask.o

From 1bb5d6609767b631526a95446198e5f436159bea Mon Sep 17 00:00:00 2001
From: Breno Leitao
Date: Mon, 14 Oct 2024 03:02:10 -0700
Subject: [PATCH 24/71] scripts/decode_stacktrace.sh: remove trailing space

decode_stacktrace.sh adds a trailing space at the end of the decoded stack if the module is not set (in most of the lines), which leaves some lines of the stack with a trailing space and others without.

Do not add an extra space at the end of the line if the module is not set, making the output formatting consistent.

Link: https://lkml.kernel.org/r/20241014100213.1873611-1-leitao@debian.org
Signed-off-by: Breno Leitao
Reviewed-by: Elliot Berman
Reviewed-by: Carlos Llamas
Reviewed-by: Stephen Boyd
Cc: Bjorn Andersson
Cc: Luca Ceresoli
Cc: Xiong Nandi
Signed-off-by: Andrew Morton
---
 scripts/decode_stacktrace.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 826836d264c6..46fa18b80fc1 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -311,7 +311,12 @@ handle_line() {
 	parse_symbol # modifies $symbol

 	# Add up the line number to the symbol
-	echo "${words[@]}" "$symbol $module"
+	if [[ -z ${module} ]]
+	then
+		echo "${words[@]}" "$symbol"
+	else
+		echo "${words[@]}" "$symbol $module"
+	fi
 }

 while read line; do

From ad8f63f935b6785c87681d35b9408f5ecd5db967 Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Tue, 24 Sep 2024 11:07:13 +0200
Subject: [PATCH 25/71] perf/hw_breakpoint: use ERR_PTR_PCPU(), IS_ERR_PCPU() and PTR_ERR_PCPU() macros

Use ERR_PTR_PCPU() when returning an error pointer in the percpu address space, and IS_ERR_PCPU() and PTR_ERR_PCPU() when testing and extracting an error pointer from the percpu address space. These macros add an intermediate cast to unsigned long when switching named address spaces.
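For reference, the three macros look roughly as follows (a sketch based on their definitions in include/linux/err.h; treat the exact cast chain as an assumption rather than a quotation):

/* Sketch: error pointers that stay in the __percpu address space. */
#define ERR_PTR_PCPU(error)	((void __percpu *)(unsigned long)ERR_PTR(error))
#define PTR_ERR_PCPU(ptr)	(PTR_ERR((const void *)(__force const unsigned long)(ptr)))
#define IS_ERR_PCPU(ptr)	(IS_ERR((const void *)(__force const unsigned long)(ptr)))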
These macros will avoid future build errors due to pointer address space mismatches once strict percpu address space checks are enabled.

Link: https://lkml.kernel.org/r/20240924090813.1353586-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak
Cc: Mark Rutland
Cc: Alexander Shishkin
Cc: Jiri Olsa
Cc: Ian Rogers
Cc: Adrian Hunter
Cc: "Liang, Kan"
Signed-off-by: Andrew Morton
---
 kernel/events/hw_breakpoint.c           | 4 ++--
 samples/hw_breakpoint/data_breakpoint.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 6c2cb4e4f48d..bc4a61029b6d 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -849,7 +849,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 	cpu_events = alloc_percpu(typeof(*cpu_events));
 	if (!cpu_events)
-		return (void __percpu __force *)ERR_PTR(-ENOMEM);
+		return ERR_PTR_PCPU(-ENOMEM);

 	cpus_read_lock();
 	for_each_online_cpu(cpu) {
@@ -868,7 +868,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 		return cpu_events;

 	unregister_wide_hw_breakpoint(cpu_events);
-	return (void __percpu __force *)ERR_PTR(err);
+	return ERR_PTR_PCPU(err);
 }
 EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);

diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index a2c831e89ce0..fbb03b66dcbd 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -52,8 +52,8 @@ static int __init hw_break_module_init(void)
 	attr.bp_type = HW_BREAKPOINT_W;

 	sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL);
-	if (IS_ERR((void __force *)sample_hbp)) {
-		ret = PTR_ERR((void __force *)sample_hbp);
+	if (IS_ERR_PCPU(sample_hbp)) {
+		ret = PTR_ERR_PCPU(sample_hbp);
 		goto fail;
 	}

From f3adb88e6c0b50e18dab3d58b1d6c5abd35dfcf7 Mon Sep 17 00:00:00 2001
From: WangYuli
Date: Fri, 18 Oct 2024 10:47:19 +0800
Subject: [PATCH 26/71] scripts/spelling.txt: add typo "exprienced" and "rewritting"

Add the typos "exprienced" and "rewritting". They were found and fixed in the following patches:

Link: https://lore.kernel.org/all/90D42CB167CA0842+20241018021910.31359-1-wangyuli@uniontech.com/
Link: https://lore.kernel.org/all/45F06B5D4CA9F444+20241018023340.47617-1-wangyuli@uniontech.com/

Link: https://lkml.kernel.org/r/C1FE2459CF066CA5+20241018024719.58325-1-wangyuli@uniontech.com
Link: https://lore.kernel.org/all/20241017162846.GA51712@kernel.org/
Signed-off-by: WangYuli
Suggested-by: Simon Horman
Reviewed-by: Simon Horman
Cc: Colin Ian King
Cc: Jesse Brandeburg
Cc: WangYuli
Signed-off-by: Andrew Morton
---
 scripts/spelling.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index fe25bf3b605a..05bd9ca1fbfa 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -674,6 +674,7 @@ explict||explicit
 explictely||explicitly
 explictly||explicitly
 expresion||expression
+exprienced||experienced
 exprimental||experimental
 extened||extended
 exteneded||extended
@@ -1388,6 +1389,7 @@ reuest||request
 reuqest||request
 reutnred||returned
 revsion||revision
+rewritting||rewriting
 rmeoved||removed
 rmeove||remove
 rmeoves||removes

From bc8f5921cd69188627c08041276238de222ab466 Mon Sep 17 00:00:00 2001
From: Ma Wupeng
Date: Wed, 23 Oct 2024 17:31:29 +0800
Subject: [PATCH 27/71] ipc: fix memleak if msg_init_ns failed in create_ipc_ns

Percpu memory allocation may fail during create_ipc_ns(); however, this failure is not handled properly, since the ipc sysctls and mq sysctls are not released.
Fix this by releasing these two resources on failure.

Here is the kmemleak stack when the percpu allocation failed:

unreferenced object 0xffff88819de2a600 (size 512):
  comm "shmem_2nstest", pid 120711, jiffies 4300542254
  hex dump (first 32 bytes):
    60 aa 9d 84 ff ff ff ff fc 18 48 b2 84 88 ff ff  `.........H.....
    04 00 00 00 a4 01 00 00 20 e4 56 81 ff ff ff ff  ........ .V.....
  backtrace (crc be7cba35):
    [] __kmalloc_node_track_caller_noprof+0x333/0x420
    [] kmemdup_noprof+0x26/0x50
    [] setup_mq_sysctls+0x57/0x1d0
    [] copy_ipcs+0x29c/0x3b0
    [] create_new_namespaces+0x1d0/0x920
    [] copy_namespaces+0x2e9/0x3e0
    [] copy_process+0x29f3/0x7ff0
    [] kernel_clone+0xc0/0x650
    [] __do_sys_clone+0xa1/0xe0
    [] do_syscall_64+0xbf/0x1c0
    [] entry_SYSCALL_64_after_hwframe+0x4b/0x53

Link: https://lkml.kernel.org/r/20241023093129.3074301-1-mawupeng1@huawei.com
Fixes: 72d1e611082e ("ipc/msg: mitigate the lock contention with percpu counter")
Signed-off-by: Ma Wupeng
Cc:
Signed-off-by: Andrew Morton
---
 ipc/namespace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ipc/namespace.c b/ipc/namespace.c
index 6ecc30effd3e..4df91ceeeafe 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -83,13 +83,15 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	err = msg_init_ns(ns);
 	if (err)
-		goto fail_put;
+		goto fail_ipc;

 	sem_init_ns(ns);
 	shm_init_ns(ns);

 	return ns;

+fail_ipc:
+	retire_ipc_sysctls(ns);
 fail_mq:
 	retire_mq_sysctls(ns);

From 908ef9bb4bd36837c3619109bdcf58f6ab00bfc7 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Sat, 12 Oct 2024 12:28:26 +0800
Subject: [PATCH 28/71] lib/list_sort: remove unnecessary header includes

Patch series "Remove unnecessary header includes from {tools/}lib/list_sort.c".

Remove outdated and unnecessary header includes from lib/list_sort.c and tools/lib/list_sort.c. Additionally, update the hunk exceptions checked by check_headers.sh to reflect these changes.

This patch (of 3):

After commit 043b3f7b6388 ("lib/list_sort: simplify and remove MAX_LIST_LENGTH_BITS"), list_sort.c no longer uses ARRAY_SIZE() (which required kernel.h and bug.h for BUILD_BUG_ON_ZERO via __must_be_array) or memset() (which required string.h). As these headers are no longer needed, remove them.

There are no changes to the generated code, as confirmed by 'objdump -d'. Additionally, 'wc -l' shows that the size of lib/.list_sort.o.cmd is reduced from 259 lines to 101 lines.

Link: https://lkml.kernel.org/r/20241012042828.471614-1-visitorckw@gmail.com
Link: https://lkml.kernel.org/r/20241012042828.471614-2-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu
Acked-by: Namhyung Kim
Cc: Adrian Hunter
Cc: Arnaldo Carvalho de Melo
Cc: Ching-Chun (Jim) Huang
Cc: Ian Rogers
Cc: Ingo Molnar
Cc: Jiri Olsa
Cc: "Liang, Kan"
Cc: Mark Rutland
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
---
 lib/list_sort.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/list_sort.c b/lib/list_sort.c
index 0fb59e92ca2d..8d3f623536fe 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -1,9 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/bug.h>
 #include <linux/compiler.h>
 #include <linux/export.h>
-#include <linux/string.h>
 #include <linux/list_sort.h>
 #include <linux/list.h>

From ff1a39c3f86c4d998630645f6778a622a993fb8b Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Sat, 12 Oct 2024 12:28:27 +0800
Subject: [PATCH 29/71] tools/lib/list_sort: remove unnecessary header includes

Since lib/list_sort.c no longer requires ARRAY_SIZE() and memset(), the includes for kernel.h, bug.h, and string.h have been removed.
Similarly, tools/lib/list_sort.c also does not need to include these headers, so they have been removed as well.

Link: https://lkml.kernel.org/r/20241012042828.471614-3-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu
Acked-by: Namhyung Kim
Cc: Adrian Hunter
Cc: Arnaldo Carvalho de Melo
Cc: Ching-Chun (Jim) Huang
Cc: Ian Rogers
Cc: Ingo Molnar
Cc: Jiri Olsa
Cc: "Liang, Kan"
Cc: Mark Rutland
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
---
 tools/lib/list_sort.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/lib/list_sort.c b/tools/lib/list_sort.c
index 69affa251fa7..bb99e493dcd1 100644
--- a/tools/lib/list_sort.c
+++ b/tools/lib/list_sort.c
@@ -1,8 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
 #include <linux/compiler.h>
 #include <linux/export.h>
-#include <linux/string.h>
 #include <linux/list_sort.h>
 #include <linux/list.h>

From 8f0d91f41000e769f16b62a4b44f1f6da6db905b Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Sat, 12 Oct 2024 12:28:28 +0800
Subject: [PATCH 30/71] perf tools: update expected diff for lib/list_sort.c

Since there are no longer any header include differences between lib/list_sort.c and tools/lib/list_sort.c, update the expected diff in check-header_ignore_hunks accordingly.

Link: https://lkml.kernel.org/r/20241012042828.471614-4-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu
Acked-by: Namhyung Kim
Cc: Adrian Hunter
Cc: Arnaldo Carvalho de Melo
Cc: Ching-Chun (Jim) Huang
Cc: Ian Rogers
Cc: Ingo Molnar
Cc: Jiri Olsa
Cc: "Liang, Kan"
Cc: Mark Rutland
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
---
 tools/perf/check-header_ignore_hunks/lib/list_sort.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/tools/perf/check-header_ignore_hunks/lib/list_sort.c b/tools/perf/check-header_ignore_hunks/lib/list_sort.c
index 32d98cb34f80..b7316d29857d 100644
--- a/tools/perf/check-header_ignore_hunks/lib/list_sort.c
+++ b/tools/perf/check-header_ignore_hunks/lib/list_sort.c
@@ -1,11 +1,4 @@
-@@ -1,5 +1,6 @@
- // SPDX-License-Identifier: GPL-2.0
- #include <linux/kernel.h>
-+#include <linux/bug.h>
- #include <linux/compiler.h>
- #include <linux/export.h>
- #include <linux/string.h>
-@@ -52,6 +53,7 @@
+@@ -50,6 +50,7 @@
 	struct list_head *a, struct list_head *b)
 {
 	struct list_head *tail = head;
@@ -13,7 +6,7 @@
 	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-@@ -77,6 +79,15 @@
+@@ -75,6 +76,15 @@
 	/* Finish linking remainder of list b on to tail */
 	tail->next = b;
 	do {

From 74ef070e325465a1b364db6a5c6859785537f835 Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Mon, 21 Oct 2024 10:07:36 +0200
Subject: [PATCH 31/71] percpu: merge VERIFY_PERCPU_PTR() into its only user

Merge VERIFY_PERCPU_PTR() into the non-CONFIG_SMP per_cpu_ptr() to make the macro similar to the CONFIG_SMP per_cpu_ptr(). This will allow a follow-up patch to refactor common code to a macro.

No functional changes; the non-CONFIG_SMP per_cpu_ptr() was the only user of VERIFY_PERCPU_PTR().
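For context, a minimal illustration of the macro being consolidated; the per-CPU counter and the helper are invented for the example:

DEFINE_PER_CPU(int, my_counter);

static void bump(int cpu)
{
	int *p = per_cpu_ptr(&my_counter, cpu);	/* expands via the macros in the diff below */

	(*p)++;
}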
Link: https://lkml.kernel.org/r/20241021080856.48746-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak
Acked-by: Christoph Lameter
Cc: Dennis Zhou
Cc: Tejun Heo
Signed-off-by: Andrew Morton
---
 include/linux/percpu-defs.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8efce7414fad..7fa88c5f4b26 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -254,13 +254,13 @@ do { \
 #else /* CONFIG_SMP */

-#define VERIFY_PERCPU_PTR(__p) \
+#define per_cpu_ptr(ptr, cpu) \
 ({ \
-	__verify_pcpu_ptr(__p); \
-	(typeof(*(__p)) __kernel __force *)(__p); \
+	(void)(cpu); \
+	__verify_pcpu_ptr(ptr); \
+	(typeof(*(ptr)) __kernel __force *)(ptr); \
 })

-#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); VERIFY_PERCPU_PTR(ptr); })
 #define raw_cpu_ptr(ptr) per_cpu_ptr(ptr, 0)
 #define this_cpu_ptr(ptr) raw_cpu_ptr(ptr)

From 001217defda86d0d6a5a9e6cf77a6b813857e7e3 Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Mon, 21 Oct 2024 10:07:37 +0200
Subject: [PATCH 32/71] percpu: introduce PERCPU_PTR() macro

Introduce the PERCPU_PTR() macro to cast a percpu pointer from the percpu address space to the generic (kernel) address space. Use it in per_cpu_ptr() and the related SHIFT_PERCPU_PTR() macros.

Also remove common knowledge from the SHIFT_PERCPU_PTR() comment; the "weird cast" is just the standard way to inform sparse of a cast from the percpu address space to the generic address space.

Link: https://lkml.kernel.org/r/20241021080856.48746-2-ubizjak@gmail.com
Signed-off-by: Uros Bizjak
Acked-by: Christoph Lameter
Cc: Dennis Zhou
Cc: Tejun Heo
Signed-off-by: Andrew Morton
---
 include/linux/percpu-defs.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 7fa88c5f4b26..e1cf7982424f 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -220,15 +220,17 @@ do { \
 	(void)__vpp_verify; \
 } while (0)

+#define PERCPU_PTR(__p) \
+	(typeof(*(__p)) __force __kernel *)(__p);
+
 #ifdef CONFIG_SMP

 /*
- * Add an offset to a pointer but keep the pointer as-is. Use RELOC_HIDE()
- * to prevent the compiler from making incorrect assumptions about the
- * pointer value. The weird cast keeps both GCC and sparse happy.
+ * Add an offset to a pointer. Use RELOC_HIDE() to prevent the compiler
+ * from making incorrect assumptions about the pointer value.
 */
 #define SHIFT_PERCPU_PTR(__p, __offset) \
-	RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset))
+	RELOC_HIDE(PERCPU_PTR(__p), (__offset))

 #define per_cpu_ptr(ptr, cpu) \
 ({ \
@@ -258,7 +260,7 @@ do { \
 ({ \
 	(void)(cpu); \
 	__verify_pcpu_ptr(ptr); \
-	(typeof(*(ptr)) __kernel __force *)(ptr); \
+	PERCPU_PTR(ptr); \
 })

 #define raw_cpu_ptr(ptr) per_cpu_ptr(ptr, 0)

From dabddd687c9e1a06241d6b4d1f66b9f2b60b3ad1 Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Mon, 21 Oct 2024 10:07:38 +0200
Subject: [PATCH 33/71] percpu: cast percpu pointer in PERCPU_PTR() via unsigned long

Cast the pointer from the percpu address space to the generic (kernel) address space in the PERCPU_PTR() macro via an intermediate cast to unsigned long [1]. This intermediate cast is also required to avoid a build failure when GCC's strict named address space checks for x86 targets [2] are enabled.

Found by GCC's named address space checks.
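As a small illustration of the kind of mismatch such checks catch (the function and variable names here are invented, not from the patch):

static void example(void __percpu *pcp)
{
	void *bad = (void *)pcp;			/* sparse: cast removes address space '__percpu' */
	unsigned long v = (__force unsigned long)pcp;	/* __force marks the drop as intentional */
	void *ok = (void *)v;				/* plain kernel pointer, no warning */
}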
[1] https://sparse.docs.kernel.org/en/latest/annotations.html#address-space-name
[2] https://gcc.gnu.org/onlinedocs/gcc/Named-Address-Spaces.html#x86-Named-Address-Spaces

Link: https://lkml.kernel.org/r/20241021080856.48746-3-ubizjak@gmail.com
Signed-off-by: Uros Bizjak
Acked-by: Christoph Lameter
Cc: Dennis Zhou
Cc: Tejun Heo
Signed-off-by: Andrew Morton
---
 include/linux/percpu-defs.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index e1cf7982424f..35842d1e3879 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -221,7 +221,10 @@ do { \
 } while (0)

 #define PERCPU_PTR(__p) \
-	(typeof(*(__p)) __force __kernel *)(__p);
+({ \
+	unsigned long __pcpu_ptr = (__force unsigned long)(__p); \
+	(typeof(*(__p)) __force __kernel *)(__pcpu_ptr); \
+})

 #ifdef CONFIG_SMP

From 92a8b224b833e82d286d2100432adbac8cf8a2a1 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Sun, 20 Oct 2024 12:01:51 +0800
Subject: [PATCH 34/71] lib/min_heap: introduce non-inline versions of min heap API functions

Patch series "Enhance min heap API with non-inline functions and optimizations", v2.

Add non-inline versions of the min heap API functions in lib/min_heap.c and update all users outside of kernel/events/core.c to use these non-inline versions. To mitigate the performance impact of indirect function calls caused by the non-inline versions of the swap and compare functions, a builtin swap has been introduced that swaps elements based on their size. Additionally, it micro-optimizes the efficiency of the min heap by pre-scaling the counter, following the same approach as in lib/sort.c. Documentation for the min heap API has also been added to the core-api section.

This patch (of 10):

All current min heap API functions are marked with '__always_inline'. However, as the number of users increases, inlining these functions everywhere leads to an increase in kernel size.

In performance-critical paths, such as when perf events are enabled and min heap functions are called on every context switch, it is important to retain the inline versions for optimal performance. To balance this, the original inline functions are kept, and additional non-inline versions of the functions have been added in lib/min_heap.c.
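For orientation, a minimal usage sketch of the min heap API this series extends; the element type, callbacks, and helper below are illustrative, assuming the interface of include/linux/min_heap.h:

DEFINE_MIN_HEAP(int, int_min_heap);

static bool int_less(const void *l, const void *r, void *args)
{
	return *(const int *)l < *(const int *)r;
}

static void int_swp(void *l, void *r, void *args)
{
	int t = *(int *)l;

	*(int *)l = *(int *)r;
	*(int *)r = t;
}

static int smallest(int *buf, int n)	/* buf has room for n elements */
{
	struct int_min_heap heap;
	const struct min_heap_callbacks cb = { .less = int_less, .swp = int_swp };
	int v = 42;

	min_heap_init(&heap, buf, n);
	min_heap_push(&heap, &v, &cb, NULL);
	return *min_heap_peek(&heap);	/* the minimum element sits at the root */
}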
Link: https://lkml.kernel.org/r/20241020040200.939973-1-visitorckw@gmail.com Link: https://lore.kernel.org/20240522161048.8d8bbc7b153b4ecd92c50666@linux-foundation.org Link: https://lkml.kernel.org/r/20241020040200.939973-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Suggested-by: Andrew Morton Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: Kuan-Wei Chiu Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- drivers/md/bcache/Kconfig | 1 + drivers/md/dm-vdo/Kconfig | 1 + fs/bcachefs/Kconfig | 1 + include/linux/min_heap.h | 129 +++++++++++++++++++++++++------------- kernel/events/core.c | 6 +- lib/Kconfig | 3 + lib/Kconfig.debug | 1 + lib/Makefile | 1 + lib/min_heap.c | 70 +++++++++++++++++++++ 9 files changed, 167 insertions(+), 46 deletions(-) create mode 100644 lib/min_heap.c diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index b2d10063d35f..d4697e79d5a3 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -5,6 +5,7 @@ config BCACHE select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 select CLOSURES + select MIN_HEAP help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/dm-vdo/Kconfig b/drivers/md/dm-vdo/Kconfig index 111ecd2c2a24..2400b2bc4bc7 100644 --- a/drivers/md/dm-vdo/Kconfig +++ b/drivers/md/dm-vdo/Kconfig @@ -7,6 +7,7 @@ config DM_VDO select DM_BUFIO select LZ4_COMPRESS select LZ4_DECOMPRESS + select MIN_HEAP help This device mapper target presents a block device with deduplication, compression and thin-provisioning. diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5bac803ea367..ab6c95b895b3 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -24,6 +24,7 @@ config BCACHEFS_FS select XXHASH select SRCU select SYMBOLIC_ERRNAME + select MIN_HEAP help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 43a7b9dcf15e..0abb21173979 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -40,7 +40,7 @@ struct min_heap_callbacks { /* Initialize a min-heap. */ static __always_inline -void __min_heap_init(min_heap_char *heap, void *data, int size) +void __min_heap_init_inline(min_heap_char *heap, void *data, int size) { heap->nr = 0; heap->size = size; @@ -50,33 +50,33 @@ void __min_heap_init(min_heap_char *heap, void *data, int size) heap->data = heap->preallocated; } -#define min_heap_init(_heap, _data, _size) \ - __min_heap_init((min_heap_char *)_heap, _data, _size) +#define min_heap_init_inline(_heap, _data, _size) \ + __min_heap_init_inline((min_heap_char *)_heap, _data, _size) /* Get the minimum element from the heap. */ static __always_inline -void *__min_heap_peek(struct min_heap_char *heap) +void *__min_heap_peek_inline(struct min_heap_char *heap) { return heap->nr ? heap->data : NULL; } -#define min_heap_peek(_heap) \ - (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) +#define min_heap_peek_inline(_heap) \ + (__minheap_cast(_heap) __min_heap_peek_inline((min_heap_char *)_heap)) /* Check if the heap is full. 
*/ static __always_inline -bool __min_heap_full(min_heap_char *heap) +bool __min_heap_full_inline(min_heap_char *heap) { return heap->nr == heap->size; } -#define min_heap_full(_heap) \ - __min_heap_full((min_heap_char *)_heap) +#define min_heap_full_inline(_heap) \ + __min_heap_full_inline((min_heap_char *)_heap) /* Sift the element at pos down the heap. */ static __always_inline -void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *left, *right; void *data = heap->data; @@ -108,13 +108,14 @@ void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, } } -#define min_heap_sift_down(_heap, _pos, _func, _args) \ - __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) +#define min_heap_sift_down_inline(_heap, _pos, _func, _args) \ + __min_heap_sift_down_inline((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), \ + _func, _args) /* Sift up ith element from the heap, O(log2(nr)). */ static __always_inline -void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, - const struct min_heap_callbacks *func, void *args) +void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; size_t parent; @@ -128,27 +129,28 @@ void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, } } -#define min_heap_sift_up(_heap, _idx, _func, _args) \ - __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) +#define min_heap_sift_up_inline(_heap, _idx, _func, _args) \ + __min_heap_sift_up_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ + _func, _args) /* Floyd's approach to heapification that is O(nr). */ static __always_inline -void __min_heapify_all(min_heap_char *heap, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { int i; for (i = heap->nr / 2 - 1; i >= 0; i--) - __min_heap_sift_down(heap, i, elem_size, func, args); + __min_heap_sift_down_inline(heap, i, elem_size, func, args); } -#define min_heapify_all(_heap, _func, _args) \ - __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heapify_all_inline(_heap, _func, _args) \ + __min_heapify_all_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline -bool __min_heap_pop(min_heap_char *heap, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; @@ -158,13 +160,13 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size, /* Place last element at the root (position 0) and then sift down. 
*/ heap->nr--; memcpy(data, data + (heap->nr * elem_size), elem_size); - __min_heap_sift_down(heap, 0, elem_size, func, args); + __min_heap_sift_down_inline(heap, 0, elem_size, func, args); return true; } -#define min_heap_pop(_heap, _func, _args) \ - __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_inline(_heap, _func, _args) \ + __min_heap_pop_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) /* * Remove the minimum element and then push the given element. The @@ -172,22 +174,21 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size, * efficient than a pop followed by a push that does 2. */ static __always_inline -void __min_heap_pop_push(min_heap_char *heap, - const void *element, size_t elem_size, - const struct min_heap_callbacks *func, - void *args) +void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { memcpy(heap->data, element, elem_size); - __min_heap_sift_down(heap, 0, elem_size, func, args); + __min_heap_sift_down_inline(heap, 0, elem_size, func, args); } -#define min_heap_pop_push(_heap, _element, _func, _args) \ - __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_push_inline(_heap, _element, _func, _args) \ + __min_heap_pop_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) /* Push an element on to the heap, O(log2(nr)). */ static __always_inline -bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; int pos; @@ -201,18 +202,19 @@ bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, heap->nr++; /* Sift child at pos up. */ - __min_heap_sift_up(heap, elem_size, pos, func, args); + __min_heap_sift_up_inline(heap, elem_size, pos, func, args); return true; } -#define min_heap_push(_heap, _element, _func, _args) \ - __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) +#define min_heap_push_inline(_heap, _element, _func, _args) \ + __min_heap_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) /* Remove ith element from the heap, O(log2(nr)). 
*/ static __always_inline -bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; @@ -224,12 +226,53 @@ bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, if (idx == heap->nr) return true; func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args); - __min_heap_sift_up(heap, elem_size, idx, func, args); - __min_heap_sift_down(heap, idx, elem_size, func, args); + __min_heap_sift_up_inline(heap, elem_size, idx, func, args); + __min_heap_sift_down_inline(heap, idx, elem_size, func, args); return true; } +#define min_heap_del_inline(_heap, _idx, _func, _args) \ + __min_heap_del_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ + _func, _args) + +void __min_heap_init(min_heap_char *heap, void *data, int size); +void *__min_heap_peek(struct min_heap_char *heap); +bool __min_heap_full(min_heap_char *heap); +void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args); +void __min_heapify_all(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_pop(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args); + +#define min_heap_init(_heap, _data, _size) \ + __min_heap_init((min_heap_char *)_heap, _data, _size) +#define min_heap_peek(_heap) \ + (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) +#define min_heap_full(_heap) \ + __min_heap_full((min_heap_char *)_heap) +#define min_heap_sift_down(_heap, _pos, _func, _args) \ + __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) +#define min_heap_sift_up(_heap, _idx, _func, _args) \ + __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) +#define min_heapify_all(_heap, _func, _args) \ + __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop(_heap, _func, _args) \ + __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_push(_heap, _element, _func, _args) \ + __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) +#define min_heap_push(_heap, _element, _func, _args) \ + __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) #define min_heap_del(_heap, _idx, _func, _args) \ __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) diff --git a/kernel/events/core.c b/kernel/events/core.c index df27d08a7232..1b3c1198b2af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3870,7 +3870,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); } 
-	min_heapify_all(&event_heap, &perf_min_heap, NULL);
+	min_heapify_all_inline(&event_heap, &perf_min_heap, NULL);

 	while (event_heap.nr) {
 		ret = func(*evt, data);
@@ -3879,9 +3879,9 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
 		*evt = perf_event_groups_next(*evt, pmu);
 		if (*evt)
-			min_heap_sift_down(&event_heap, 0, &perf_min_heap, NULL);
+			min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL);
 		else
-			min_heap_pop(&event_heap, &perf_min_heap, NULL);
+			min_heap_pop_inline(&event_heap, &perf_min_heap, NULL);
 	}

 	return 0;

diff --git a/lib/Kconfig b/lib/Kconfig
index cf303bd91dda..f5a2781669ea 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -780,3 +780,6 @@ config FIRMWARE_TABLE
 config UNION_FIND
 	bool
+
+config MIN_HEAP
+	bool

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index eda319e9d569..2549b64b2280 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2279,6 +2279,7 @@ config TEST_LIST_SORT
 config TEST_MIN_HEAP
 	tristate "Min heap test"
 	depends on DEBUG_KERNEL || m
+	select MIN_HEAP
 	help
 	  Enable this to turn on min heap function tests. This test is
 	  executed only once during system boot (so affects only boot time),

diff --git a/lib/Makefile b/lib/Makefile
index feebed74fc7a..1eb89962daef 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -40,6 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 lib-$(CONFIG_UNION_FIND) += union_find.o
 lib-$(CONFIG_PRINTK) += dump_stack.o
 lib-$(CONFIG_SMP) += cpumask.o
+lib-$(CONFIG_MIN_HEAP) += min_heap.o
 lib-y += kobject.o klist.o
 obj-y += lockref.o

diff --git a/lib/min_heap.c b/lib/min_heap.c
new file mode 100644
index 000000000000..4485372ff3b1
--- /dev/null
+++ b/lib/min_heap.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+#include <linux/min_heap.h>
+
+void __min_heap_init(min_heap_char *heap, void *data, int size)
+{
+	__min_heap_init_inline(heap, data, size);
+}
+EXPORT_SYMBOL(__min_heap_init);
+
+void *__min_heap_peek(struct min_heap_char *heap)
+{
+	return __min_heap_peek_inline(heap);
+}
+EXPORT_SYMBOL(__min_heap_peek);
+
+bool __min_heap_full(min_heap_char *heap)
+{
+	return __min_heap_full_inline(heap);
+}
+EXPORT_SYMBOL(__min_heap_full);
+
+void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size,
+		const struct min_heap_callbacks *func, void *args)
+{
+	__min_heap_sift_down_inline(heap, pos, elem_size, func, args);
+}
+EXPORT_SYMBOL(__min_heap_sift_down);
+
+void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+		const struct min_heap_callbacks *func, void *args)
+{
+	__min_heap_sift_up_inline(heap, elem_size, idx, func, args);
+}
+EXPORT_SYMBOL(__min_heap_sift_up);
+
+void __min_heapify_all(min_heap_char *heap, size_t elem_size,
+		const struct min_heap_callbacks *func, void *args)
+{
+	__min_heapify_all_inline(heap, elem_size, func, args);
+}
+EXPORT_SYMBOL(__min_heapify_all);
+
+bool __min_heap_pop(min_heap_char *heap, size_t elem_size,
+		const struct min_heap_callbacks *func, void *args)
+{
+	return __min_heap_pop_inline(heap, elem_size, func, args);
+}
+EXPORT_SYMBOL(__min_heap_pop);
+
+void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size,
+		const struct min_heap_callbacks *func, void *args)
+{
+	__min_heap_pop_push_inline(heap, element, elem_size, func, args);
+}
+EXPORT_SYMBOL(__min_heap_pop_push);
+
+bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+		const struct min_heap_callbacks *func, void *args)
+{
+	return __min_heap_push_inline(heap, element, elem_size, func, args);
+}
+EXPORT_SYMBOL(__min_heap_push); + +bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + return __min_heap_del_inline(heap, elem_size, idx, func, args); +} +EXPORT_SYMBOL(__min_heap_del); From aa5888afc2347ebb394c2c4b694fa3026775009e Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:52 +0800 Subject: [PATCH 35/71] lib min_heap: optimize min heap by prescaling counters for better performance Improve the efficiency of the min heap by prescaling counters, eliminating the need to repeatedly compute 'index * element_size' when accessing elements. By doing so, we avoid the overhead associated with recalculating the byte offset for each heap operation. However, with prescaling, the calculation for the parent element's location is no longer as simple as '(i - 1) / 2'. To address this, we copy the parent function from 'lib/sort.c', which calculates the parent offset in a branchless manner without using any division instructions. This optimization should result in a more efficient heap implementation by reducing the computational overhead of finding parent and child offsets. Link: https://lkml.kernel.org/r/20241020040200.939973-3-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 73 +++++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 0abb21173979..41b092a14238 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -38,6 +38,32 @@ struct min_heap_callbacks { void (*swp)(void *lhs, void *rhs, void *args); }; +/** + * parent - given the offset of the child, find the offset of the parent. + * @i: the offset of the heap element whose parent is sought. Non-zero. + * @lsbit: a precomputed 1-bit mask, equal to "size & -size" + * @size: size of each element + * + * In terms of array indexes, the parent of element j = @i/@size is simply + * (j-1)/2. But when working in byte offsets, we can't use implicit + * truncation of integer divides. + * + * Fortunately, we only need one bit of the quotient, not the full divide. + * @size has a least significant bit. That bit will be clear if @i is + * an even multiple of @size, and set if it's an odd multiple. + * + * Logically, we're doing "if (i & lsbit) i -= size;", but since the + * branch is unpredictable, it's done with a bit of clever branch-free + * code instead. + */ +__attribute_const__ __always_inline +static size_t parent(size_t i, unsigned int lsbit, size_t size) +{ + i -= size; + i -= size & -(i & lsbit); + return i / 2; +} + /* Initialize a min-heap. 
*/ static __always_inline void __min_heap_init_inline(min_heap_char *heap, void *data, int size) @@ -78,33 +104,30 @@ static __always_inline void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, const struct min_heap_callbacks *func, void *args) { - void *left, *right; + const unsigned long lsbit = elem_size & -elem_size; void *data = heap->data; - void *root = data + pos * elem_size; - int i = pos, j; + /* pre-scale counters for performance */ + size_t a = pos * elem_size; + size_t b, c, d; + size_t n = heap->nr * elem_size; /* Find the sift-down path all the way to the leaves. */ - for (;;) { - if (i * 2 + 2 >= heap->nr) - break; - left = data + (i * 2 + 1) * elem_size; - right = data + (i * 2 + 2) * elem_size; - i = func->less(left, right, args) ? i * 2 + 1 : i * 2 + 2; - } + for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;) + b = func->less(data + c, data + d, args) ? c : d; /* Special case for the last leaf with no sibling. */ - if (i * 2 + 2 == heap->nr) - i = i * 2 + 1; + if (d == n) + b = c; /* Backtrack to the correct location. */ - while (i != pos && func->less(root, data + i * elem_size, args)) - i = (i - 1) / 2; + while (b != a && func->less(data + a, data + b, args)) + b = parent(b, lsbit, elem_size); /* Shift the element into its correct place. */ - j = i; - while (i != pos) { - i = (i - 1) / 2; - func->swp(data + i * elem_size, data + j * elem_size, args); + c = b; + while (b != a) { + b = parent(b, lsbit, elem_size); + func->swp(data + b, data + c, args); } } @@ -117,15 +140,17 @@ static __always_inline void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx, const struct min_heap_callbacks *func, void *args) { + const unsigned long lsbit = elem_size & -elem_size; void *data = heap->data; - size_t parent; + /* pre-scale counters for performance */ + size_t a = idx * elem_size, b; - while (idx) { - parent = (idx - 1) / 2; - if (func->less(data + parent * elem_size, data + idx * elem_size, args)) + while (a) { + b = parent(a, lsbit, elem_size); + if (func->less(data + b, data + a, args)) break; - func->swp(data + parent * elem_size, data + idx * elem_size, args); - idx = parent; + func->swp(data + a, data + b, args); + a = b; } } From 03ec56d084611b5a4dc06ffa74db0928616e4d7f Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:53 +0800 Subject: [PATCH 36/71] lib min_heap: avoid indirect function call by providing default swap The non-inline min heap API can result in an indirect function call to the custom swap function. This becomes particularly costly when CONFIG_MITIGATION_RETPOLINE is enabled, as indirect function calls are expensive in this case. To address this, copy the code from lib/sort.c and provide a default builtin swap implementation that performs element swaps based on the element size. This change allows most users to avoid the overhead of indirect function calls, improving efficiency. 
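On the caller side, this reduces to leaving the swap slot empty. A hedged sketch of the pattern the follow-up patches apply (the comparison callback is an invented example):

static bool int_less(const void *l, const void *r, void *args)
{
	return *(const int *)l < *(const int *)r;
}

static const struct min_heap_callbacks cb = {
	.less = int_less,	/* element comparison is still user-supplied */
	.swp  = NULL,		/* NULL selects the size-based builtin swap */
};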
Link: https://lkml.kernel.org/r/20241020040200.939973-4-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 159 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 156 insertions(+), 3 deletions(-) diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 41b092a14238..e781727c8916 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -38,6 +38,147 @@ struct min_heap_callbacks { void (*swp)(void *lhs, void *rhs, void *args); }; +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) +{ + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; +} + +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is still valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static __always_inline +void swap_words_32(void *a, void *b, size_t n) +{ + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); +} + +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? 
+ */ +static __always_inline +void swap_words_64(void *a, void *b, size_t n) +{ + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); +} + +/** + * swap_bytes - swap two elements a byte at a time + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. + */ +static __always_inline +void swap_bytes(void *a, void *b, size_t n) +{ + do { + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0) +#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1) +#define SWAP_BYTES ((void (*)(void *, void *, void *))2) + +/* + * Selects the appropriate swap function based on the element size. + */ +static __always_inline +void *select_swap_func(const void *base, size_t size) +{ + if (is_aligned(base, size, 8)) + return SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + return SWAP_WORDS_32; + else + return SWAP_BYTES; +} + +static __always_inline +void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args), + void *priv) +{ + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, priv); +} + /** * parent - given the offset of the child, find the offset of the parent. * @i: the offset of the heap element whose parent is sought. Non-zero. @@ -106,11 +247,15 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, { const unsigned long lsbit = elem_size & -elem_size; void *data = heap->data; + void (*swp)(void *lhs, void *rhs, void *args) = func->swp; /* pre-scale counters for performance */ size_t a = pos * elem_size; size_t b, c, d; size_t n = heap->nr * elem_size; + if (!swp) + swp = select_swap_func(data, elem_size); + /* Find the sift-down path all the way to the leaves. */ for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;) b = func->less(data + c, data + d, args) ? 
c : d; @@ -127,7 +272,7 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, c = b; while (b != a) { b = parent(b, lsbit, elem_size); - func->swp(data + b, data + c, args); + do_swap(data + b, data + c, elem_size, swp, args); } } @@ -142,14 +287,18 @@ void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx { const unsigned long lsbit = elem_size & -elem_size; void *data = heap->data; + void (*swp)(void *lhs, void *rhs, void *args) = func->swp; /* pre-scale counters for performance */ size_t a = idx * elem_size, b; + if (!swp) + swp = select_swap_func(data, elem_size); + while (a) { b = parent(a, lsbit, elem_size); if (func->less(data + b, data + a, args)) break; - func->swp(data + a, data + b, args); + do_swap(data + a, data + b, elem_size, swp, args); a = b; } } @@ -242,15 +391,19 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, const struct min_heap_callbacks *func, void *args) { void *data = heap->data; + void (*swp)(void *lhs, void *rhs, void *args) = func->swp; if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) return false; + if (!swp) + swp = select_swap_func(data, elem_size); + /* Place last element at the root (position 0) and then sift down. */ heap->nr--; if (idx == heap->nr) return true; - func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args); + do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args); __min_heap_sift_up_inline(heap, elem_size, idx, func, args); __min_heap_sift_down_inline(heap, idx, elem_size, func, args); From d559bb2c6deea1fb4650b8a784b27e87ea12f71d Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:54 +0800 Subject: [PATCH 37/71] lib/test_min_heap: update min_heap_callbacks to use default builtin swap Replace the swp function pointer in the min_heap_callbacks of test_min_heap with NULL, allowing direct usage of the default builtin swap implementation. This modification simplifies the code and improves performance by removing unnecessary function indirection. Link: https://lkml.kernel.org/r/20241020040200.939973-5-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- lib/test_min_heap.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/lib/test_min_heap.c b/lib/test_min_heap.c index 64c877e73b64..e6fbb798558b 100644 --- a/lib/test_min_heap.c +++ b/lib/test_min_heap.c @@ -23,14 +23,6 @@ static __init bool greater_than(const void *lhs, const void *rhs, void __always_ return *(int *)lhs > *(int *)rhs; } -static __init void swap_ints(void *lhs, void *rhs, void __always_unused *args) -{ - int temp = *(int *)lhs; - - *(int *)lhs = *(int *)rhs; - *(int *)rhs = temp; -} - static __init int pop_verify_heap(bool min_heap, struct min_heap_test *heap, const struct min_heap_callbacks *funcs) @@ -72,7 +64,7 @@ static __init int test_heapify_all(bool min_heap) }; struct min_heap_callbacks funcs = { .less = min_heap ? less_than : greater_than, - .swp = swap_ints, + .swp = NULL, }; int i, err; @@ -104,7 +96,7 @@ static __init int test_heap_push(bool min_heap) }; struct min_heap_callbacks funcs = { .less = min_heap ? 
less_than : greater_than, - .swp = swap_ints, + .swp = NULL, }; int i, temp, err; @@ -136,7 +128,7 @@ static __init int test_heap_pop_push(bool min_heap) }; struct min_heap_callbacks funcs = { .less = min_heap ? less_than : greater_than, - .swp = swap_ints, + .swp = NULL, }; int i, temp, err; @@ -175,7 +167,7 @@ static __init int test_heap_del(bool min_heap) heap.nr = ARRAY_SIZE(values); struct min_heap_callbacks funcs = { .less = min_heap ? less_than : greater_than, - .swp = swap_ints, + .swp = NULL, }; int i, err; From 083ad2871a8bbaf404b97eaa5e713e427e229f6b Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:55 +0800 Subject: [PATCH 38/71] perf/core: update min_heap_callbacks to use default builtin swap After introducing the default builtin swap implementation, update the min_heap_callbacks to replace the swp function pointer with NULL. This change allows the min heap to directly utilize the builtin swap, simplifying the code. Link: https://lkml.kernel.org/r/20241020040200.939973-6-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- kernel/events/core.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1b3c1198b2af..c2b4d7ee6296 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3778,18 +3778,11 @@ static bool perf_less_group_idx(const void *l, const void *r, void __always_unus return le->group_index < re->group_index; } -static void swap_ptr(void *l, void *r, void __always_unused *args) -{ - void **lp = l, **rp = r; - - swap(*lp, *rp); -} - DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap); static const struct min_heap_callbacks perf_min_heap = { .less = perf_less_group_idx, - .swp = swap_ptr, + .swp = NULL, }; static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event) From d6844302074aac634afd00c4b70a1e1249afeff3 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:56 +0800 Subject: [PATCH 39/71] dm vdo: update min_heap_callbacks to use default builtin swap Replace the swp function pointer in the min_heap_callbacks of dm-vdo with NULL, allowing direct usage of the default builtin swap implementation. This modification simplifies the code and improves performance by removing unnecessary function indirection. 
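A hedged aside on the limits of the builtin swap (not from the series): since it exchanges elements bytewise, it is only safe for elements whose contents do not encode their own address. A heap of elements like the following would still need a real .swp callback:

struct keyed_item {
	u64 key;
	struct list_head node;	/* neighbouring list entries point at this element's address */
};
/* A bytewise swap of two keyed_item heap elements would leave the
 * neighbouring list entries pointing at stale addresses, so such a
 * heap must keep a custom .swp callback that re-links the nodes. */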
Link: https://lkml.kernel.org/r/20241020040200.939973-7-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- drivers/md/dm-vdo/repair.c | 2 +- drivers/md/dm-vdo/slab-depot.c | 10 +--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c index ffff2c999518..8c006fb3afcf 100644 --- a/drivers/md/dm-vdo/repair.c +++ b/drivers/md/dm-vdo/repair.c @@ -166,7 +166,7 @@ static void swap_mappings(void *item1, void *item2, void __always_unused *args) static const struct min_heap_callbacks repair_min_heap = { .less = mapping_is_less_than, - .swp = swap_mappings, + .swp = NULL, }; static struct numbered_block_mapping *sort_next_heap_element(struct repair_completion *repair) diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 274f9ccd072f..ccf7b2eac131 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -3301,17 +3301,9 @@ static bool slab_status_is_less_than(const void *item1, const void *item2, return info1->slab_number < info2->slab_number; } -static void swap_slab_statuses(void *item1, void *item2, void __always_unused *args) -{ - struct slab_status *info1 = item1; - struct slab_status *info2 = item2; - - swap(*info1, *info2); -} - static const struct min_heap_callbacks slab_status_min_heap = { .less = slab_status_is_less_than, - .swp = swap_slab_statuses, + .swp = NULL, }; /* Inform the slab actor that a action has finished on some slab; used by apply_to_slabs(). */ From 3d8a9a1c35227c3f1b0bd132c9f0a80dbda07b65 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:57 +0800 Subject: [PATCH 40/71] bcache: update min_heap_callbacks to use default builtin swap Replace the swp function pointer in the min_heap_callbacks of bcache with NULL, allowing direct usage of the default builtin swap implementation. This modification simplifies the code and improves performance by removing unnecessary function indirection. 
Link: https://lkml.kernel.org/r/20241020040200.939973-8-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- drivers/md/bcache/alloc.c | 11 ++--------- drivers/md/bcache/bset.c | 14 +++----------- drivers/md/bcache/extents.c | 10 +--------- drivers/md/bcache/movinggc.c | 10 +--------- 4 files changed, 7 insertions(+), 38 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index da50f6661bae..8998e61efa40 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -189,23 +189,16 @@ static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs); } -static inline void new_bucket_swap(void *l, void *r, void __always_unused *args) -{ - struct bucket **lhs = l, **rhs = r; - - swap(*lhs, *rhs); -} - static void invalidate_buckets_lru(struct cache *ca) { struct bucket *b; const struct min_heap_callbacks bucket_max_cmp_callback = { .less = new_bucket_max_cmp, - .swp = new_bucket_swap, + .swp = NULL, }; const struct min_heap_callbacks bucket_min_cmp_callback = { .less = new_bucket_min_cmp, - .swp = new_bucket_swap, + .swp = NULL, }; ca->heap.nr = 0; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index bd97d8626887..68258a16e125 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1093,14 +1093,6 @@ static inline bool new_btree_iter_cmp(const void *l, const void *r, void __alway return bkey_cmp(_l->k, _r->k) <= 0; } -static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) -{ - struct btree_iter_set *_iter1 = iter1; - struct btree_iter_set *_iter2 = iter2; - - swap(*_iter1, *_iter2); -} - static inline bool btree_iter_end(struct btree_iter *iter) { return !iter->heap.nr; @@ -1111,7 +1103,7 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, { const struct min_heap_callbacks callbacks = { .less = new_btree_iter_cmp, - .swp = new_btree_iter_swap, + .swp = NULL, }; if (k != end) @@ -1157,7 +1149,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, struct bkey *ret = NULL; const struct min_heap_callbacks callbacks = { .less = cmp, - .swp = new_btree_iter_swap, + .swp = NULL, }; if (!btree_iter_end(iter)) { @@ -1231,7 +1223,7 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, : bch_ptr_invalid; const struct min_heap_callbacks callbacks = { .less = b->ops->sort_cmp, - .swp = new_btree_iter_swap, + .swp = NULL, }; /* Heapify the iterator, using our comparison function */ diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index a7221e5dbe81..4b84fda1530a 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -266,20 +266,12 @@ static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_ return !(c ? 
c > 0 : _l->k < _r->k); } -static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) -{ - struct btree_iter_set *_iter1 = iter1; - struct btree_iter_set *_iter2 = iter2; - - swap(*_iter1, *_iter2); -} - static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { const struct min_heap_callbacks callbacks = { .less = new_bch_extent_sort_cmp, - .swp = new_btree_iter_swap, + .swp = NULL, }; while (iter->heap.nr > 1) { struct btree_iter_set *top = iter->heap.data, *i = top + 1; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 7f482729c56d..ef6abf33f926 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -190,14 +190,6 @@ static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *a return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r); } -static void new_bucket_swap(void *l, void *r, void __always_unused *args) -{ - struct bucket **_l = l; - struct bucket **_r = r; - - swap(*_l, *_r); -} - static unsigned int bucket_heap_top(struct cache *ca) { struct bucket *b; @@ -212,7 +204,7 @@ void bch_moving_gc(struct cache_set *c) unsigned long sectors_to_move, reserve_sectors; const struct min_heap_callbacks callbacks = { .less = new_bucket_cmp, - .swp = new_bucket_swap, + .swp = NULL, }; if (!c->copy_gc_enabled) From 06ce25145bb864e5d948c4bc7e35bdb460a4e597 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:58 +0800 Subject: [PATCH 41/71] bcachefs: clean up duplicate min_heap_callbacks declarations Refactor the bcachefs code to remove multiple redundant declarations of min_heap_callbacks, ensuring that each unique declaration appears only once. Link: https://lore.kernel.org/20241017095520.GV16066@noisy.programming.kicks-ass.net Link: https://lkml.kernel.org/r/20241020040200.939973-9-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Suggested-by: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Signed-off-by: Andrew Morton --- fs/bcachefs/clock.c | 19 +++++-------------- fs/bcachefs/ec.c | 19 +++++-------------- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 1d6b691e8da6..1fcfcb5fd44f 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -22,13 +22,13 @@ static inline void io_timer_swp(void *l, void *r, void __always_unused *args) swap(*_l, *_r); } +static const struct min_heap_callbacks callbacks = { + .less = io_timer_cmp, + .swp = io_timer_swp, +}; + void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { - const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = io_timer_swp, - }; - spin_lock(&clock->timer_lock); if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { @@ -48,11 +48,6 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { - const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = io_timer_swp, - }; - spin_lock(&clock->timer_lock); for (size_t i = 0; i < clock->timers.nr; i++) @@ -142,10 +137,6 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) { struct io_timer *ret = NULL; - 
const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = io_timer_swp, - }; if (clock->timers.nr && time_after_eq64(now, clock->timers.data[0]->expire)) { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 749dcf368841..6094afb0c6be 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1057,6 +1057,11 @@ static inline void ec_stripes_heap_swap(void *l, void *r, void *h) ec_stripes_heap_set_backpointer(_h, j); } +static const struct min_heap_callbacks callbacks = { + .less = ec_stripes_heap_cmp, + .swp = ec_stripes_heap_swap, +}; + static void heap_verify_backpointer(struct bch_fs *c, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; @@ -1069,11 +1074,6 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t idx) { - const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, - }; - mutex_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); @@ -1084,11 +1084,6 @@ void bch2_stripes_heap_del(struct bch_fs *c, void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { - const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, - }; - mutex_lock(&c->ec_stripes_heap_lock); BUG_ON(min_heap_full(&c->ec_stripes_heap)); @@ -1107,10 +1102,6 @@ void bch2_stripes_heap_insert(struct bch_fs *c, void bch2_stripes_heap_update(struct bch_fs *c, struct stripe *m, size_t idx) { - const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, - }; ec_stripes_heap *h = &c->ec_stripes_heap; bool do_deletes; size_t i; From 75e849f3d0972e28d53fcfb540593c699a61c095 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:59 +0800 Subject: [PATCH 42/71] bcachefs: update min_heap_callbacks to use default builtin swap Replace the swp function pointer in the min_heap_callbacks of bcachefs with NULL, allowing direct usage of the default builtin swap implementation. This modification simplifies the code and improves performance by removing unnecessary function indirection. 
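Note that this conversion stops at the I/O clock timers: the stripes heap
above keeps ec_stripes_heap_swap() because its swap must also refresh
backpointers, which a plain element exchange cannot do. The general shape
of such a callback, as a hypothetical sketch (struct tracked_item,
struct tracked_heap, and set_backpointer() are illustrative, not bcachefs
code):

	struct tracked_item {
		u64 key;
		/* the item's current heap slot is recorded elsewhere */
	};

	DEFINE_MIN_HEAP(struct tracked_item, tracked_heap);

	static void tracked_swap(void *l, void *r, void *args)
	{
		struct tracked_heap *h = args;	/* heap passed via 'args' */
		struct tracked_item *a = l, *b = r;

		swap(*a, *b);
		/* record the new slot of each moved element */
		set_backpointer(h, a - h->data);	/* hypothetical helper */
		set_backpointer(h, b - h->data);
	}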
Link: https://lkml.kernel.org/r/20241020040200.939973-10-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- fs/bcachefs/clock.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 1fcfcb5fd44f..1f8e035d7119 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -14,17 +14,9 @@ static inline bool io_timer_cmp(const void *l, const void *r, void __always_unus return (*_l)->expire < (*_r)->expire; } -static inline void io_timer_swp(void *l, void *r, void __always_unused *args) -{ - struct io_timer **_l = (struct io_timer **)l; - struct io_timer **_r = (struct io_timer **)r; - - swap(*_l, *_r); -} - static const struct min_heap_callbacks callbacks = { .less = io_timer_cmp, - .swp = io_timer_swp, + .swp = NULL, }; void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) From ec7c2bda802191241940e333fb199edf4b9ea67c Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:02:00 +0800 Subject: [PATCH 43/71] Documentation/core-api: add min heap API introduction Introduce an overview of the min heap API, detailing its usage and functionality. The documentation aims to provide developers with a clear understanding of how to implement and utilize min heaps within the Linux kernel, enhancing the overall accessibility of this data structure. Link: https://lkml.kernel.org/r/20241020040200.939973-11-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Bagas Sanjaya Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- Documentation/core-api/index.rst | 1 + Documentation/core-api/min_heap.rst | 300 ++++++++++++++++++++++++++++ 2 files changed, 301 insertions(+) create mode 100644 Documentation/core-api/min_heap.rst diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 6a875743dd4b..563b8fc0002f 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -52,6 +52,7 @@ Library functionality that is used throughout the kernel. wrappers/atomic_bitops floating-point union_find + min_heap Low level entry and exit ======================== diff --git a/Documentation/core-api/min_heap.rst b/Documentation/core-api/min_heap.rst new file mode 100644 index 000000000000..0c636c8b7aa5 --- /dev/null +++ b/Documentation/core-api/min_heap.rst @@ -0,0 +1,300 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============ +Min Heap API +============ + +Introduction +============ + +The Min Heap API provides a set of functions and macros for managing min-heaps +in the Linux kernel. A min-heap is a binary tree structure where the value of +each node is less than or equal to the values of its children, ensuring that +the smallest element is always at the root. + +This document provides a guide to the Min Heap API, detailing how to define and +use min-heaps. Users should not directly call functions with **__min_heap_*()** +prefixes, but should instead use the provided macro wrappers. 
+ +In addition to the standard version of the functions, the API also includes a +set of inline versions for performance-critical scenarios. These inline +functions have the same names as their non-inline counterparts but include an +**_inline** suffix. For example, **__min_heap_init_inline** and its +corresponding macro wrapper **min_heap_init_inline**. The inline versions allow +custom comparison and swap functions to be called directly, rather than through +indirect function calls. This can significantly reduce overhead, especially +when CONFIG_MITIGATION_RETPOLINE is enabled, as indirect function calls become +more expensive. As with the non-inline versions, it is important to use the +macro wrappers for inline functions instead of directly calling the functions +themselves. + +Data Structures +=============== + +Min-Heap Definition +------------------- + +The core data structure for representing a min-heap is defined using the +**MIN_HEAP_PREALLOCATED** and **DEFINE_MIN_HEAP** macros. These macros allow +you to define a min-heap with a preallocated buffer or dynamically allocated +memory. + +Example: + +.. code-block:: c + + #define MIN_HEAP_PREALLOCATED(_type, _name, _nr) + struct _name { + int nr; /* Number of elements in the heap */ + int size; /* Maximum number of elements that can be held */ + _type *data; /* Pointer to the heap data */ + _type preallocated[_nr]; /* Static preallocated array */ + } + + #define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0) + +A typical heap structure will include a counter for the number of elements +(`nr`), the maximum capacity of the heap (`size`), and a pointer to an array of +elements (`data`). Optionally, you can specify a static array for preallocated +heap storage using **MIN_HEAP_PREALLOCATED**. + +Min Heap Callbacks +------------------ + +The **struct min_heap_callbacks** provides customization options for ordering +elements in the heap and swapping them. It contains two function pointers: + +.. code-block:: c + + struct min_heap_callbacks { + bool (*less)(const void *lhs, const void *rhs, void *args); + void (*swp)(void *lhs, void *rhs, void *args); + }; + +- **less** is the comparison function used to establish the order of elements. +- **swp** is a function for swapping elements in the heap. If swp is set to + NULL, the default swap function will be used, which swaps the elements based on their size + +Macro Wrappers +============== + +The following macro wrappers are provided for interacting with the heap in a +user-friendly manner. Each macro corresponds to a function that operates on the +heap, and they abstract away direct calls to internal functions. + +Each macro accepts various parameters that are detailed below. + +Heap Initialization +-------------------- + +.. code-block:: c + + min_heap_init(heap, data, size); + +- **heap**: A pointer to the min-heap structure to be initialized. +- **data**: A pointer to the buffer where the heap elements will be stored. If + `NULL`, the preallocated buffer within the heap structure will be used. +- **size**: The maximum number of elements the heap can hold. + +This macro initializes the heap, setting its initial state. If `data` is +`NULL`, the preallocated memory inside the heap structure will be used for +storage. Otherwise, the user-provided buffer is used. The operation is **O(1)**. + +**Inline Version:** min_heap_init_inline(heap, data, size) + +Accessing the Top Element +------------------------- + +.. 
code-block:: c + + element = min_heap_peek(heap); + +- **heap**: A pointer to the min-heap from which to retrieve the smallest + element. + +This macro returns a pointer to the smallest element (the root) of the heap, or +`NULL` if the heap is empty. The operation is **O(1)**. + +**Inline Version:** min_heap_peek_inline(heap) + +Heap Insertion +-------------- + +.. code-block:: c + + success = min_heap_push(heap, element, callbacks, args); + +- **heap**: A pointer to the min-heap into which the element should be inserted. +- **element**: A pointer to the element to be inserted into the heap. +- **callbacks**: A pointer to a `struct min_heap_callbacks` providing the + `less` and `swp` functions. +- **args**: Optional arguments passed to the `less` and `swp` functions. + +This macro inserts an element into the heap. It returns `true` if the insertion +was successful and `false` if the heap is full. The operation is **O(log n)**. + +**Inline Version:** min_heap_push_inline(heap, element, callbacks, args) + +Heap Removal +------------ + +.. code-block:: c + + success = min_heap_pop(heap, callbacks, args); + +- **heap**: A pointer to the min-heap from which to remove the smallest element. +- **callbacks**: A pointer to a `struct min_heap_callbacks` providing the + `less` and `swp` functions. +- **args**: Optional arguments passed to the `less` and `swp` functions. + +This macro removes the smallest element (the root) from the heap. It returns +`true` if the element was successfully removed, or `false` if the heap is +empty. The operation is **O(log n)**. + +**Inline Version:** min_heap_pop_inline(heap, callbacks, args) + +Heap Maintenance +---------------- + +You can use the following macros to maintain the heap's structure: + +.. code-block:: c + + min_heap_sift_down(heap, pos, callbacks, args); + +- **heap**: A pointer to the min-heap. +- **pos**: The index from which to start sifting down. +- **callbacks**: A pointer to a `struct min_heap_callbacks` providing the + `less` and `swp` functions. +- **args**: Optional arguments passed to the `less` and `swp` functions. + +This macro restores the heap property by moving the element at the specified +index (`pos`) down the heap until it is in the correct position. The operation +is **O(log n)**. + +**Inline Version:** min_heap_sift_down_inline(heap, pos, callbacks, args) + +.. code-block:: c + + min_heap_sift_up(heap, idx, callbacks, args); + +- **heap**: A pointer to the min-heap. +- **idx**: The index of the element to sift up. +- **callbacks**: A pointer to a `struct min_heap_callbacks` providing the + `less` and `swp` functions. +- **args**: Optional arguments passed to the `less` and `swp` functions. + +This macro restores the heap property by moving the element at the specified +index (`idx`) up the heap. The operation is **O(log n)**. + +**Inline Version:** min_heap_sift_up_inline(heap, idx, callbacks, args) + +.. code-block:: c + + min_heapify_all(heap, callbacks, args); + +- **heap**: A pointer to the min-heap. +- **callbacks**: A pointer to a `struct min_heap_callbacks` providing the + `less` and `swp` functions. +- **args**: Optional arguments passed to the `less` and `swp` functions. + +This macro ensures that the entire heap satisfies the heap property. It is +called when the heap is built from scratch or after many modifications. The +operation is **O(n)**. + +**Inline Version:** min_heapify_all_inline(heap, callbacks, args) + +Removing Specific Elements +-------------------------- + +.. 
code-block:: c
+
+    success = min_heap_del(heap, idx, callbacks, args);
+
+- **heap**: A pointer to the min-heap.
+- **idx**: The index of the element to delete.
+- **callbacks**: A pointer to a `struct min_heap_callbacks` providing the
+  `less` and `swp` functions.
+- **args**: Optional arguments passed to the `less` and `swp` functions.
+
+This macro removes an element at the specified index (`idx`) from the heap and
+restores the heap property. The operation is **O(log n)**.
+
+**Inline Version:** min_heap_del_inline(heap, idx, callbacks, args)
+
+Other Utilities
+===============
+
+- **min_heap_full(heap)**: Checks whether the heap is full.
+  Complexity: **O(1)**.
+
+.. code-block:: c
+
+    bool full = min_heap_full(heap);
+
+- `heap`: A pointer to the min-heap to check.
+
+This macro returns `true` if the heap is full, otherwise `false`.
+
+**Inline Version:** min_heap_full_inline(heap)
+
+- **min_heap_empty(heap)**: Checks whether the heap is empty.
+  Complexity: **O(1)**.
+
+.. code-block:: c
+
+    bool empty = min_heap_empty(heap);
+
+- `heap`: A pointer to the min-heap to check.
+
+This macro returns `true` if the heap is empty, otherwise `false`.
+
+**Inline Version:** min_heap_empty_inline(heap)
+
+Example Usage
+=============
+
+An example usage of the min-heap API would involve defining a heap structure,
+initializing it, and inserting and removing elements as needed.
+
+.. code-block:: c
+
+    #include <linux/min_heap.h>
+
+    bool my_less_function(const void *lhs, const void *rhs, void *args) {
+        return (*(int *)lhs < *(int *)rhs);
+    }
+
+    struct min_heap_callbacks heap_cb = {
+        .less = my_less_function,	/* Comparison function for heap order */
+        .swp = NULL,			/* Use default swap function */
+    };
+
+    void example_usage(void) {
+        /* Pre-populate the buffer with elements */
+        int buffer[5] = {5, 2, 8, 1, 3};
+        /* Declare a min-heap type and define an instance of it */
+        DEFINE_MIN_HEAP(int, my_heap) heap;
+
+        /* Initialize the heap with preallocated buffer and size */
+        min_heap_init(&heap, buffer, 5);
+
+        /* Build the heap using min_heapify_all */
+        heap.nr = 5;	/* Set the number of elements in the heap */
+        min_heapify_all(&heap, &heap_cb, NULL);
+
+        /* Peek at the top element (should be 1 in this case) */
+        int *top = min_heap_peek(&heap);
+        pr_info("Top element: %d\n", *top);
+
+        /* Pop the top element (1) and get the new top (2) */
+        min_heap_pop(&heap, &heap_cb, NULL);
+        top = min_heap_peek(&heap);
+        pr_info("New top element: %d\n", *top);
+
+        /* Insert a new element (0) and recheck the top */
+        int new_element = 0;
+        min_heap_push(&heap, &new_element, &heap_cb, NULL);
+        top = min_heap_peek(&heap);
+        pr_info("Top element after insertion: %d\n", *top);
+    }

From 3ad563b1371b95f40b045b3bfa82848174e32a4c Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Sun, 27 Oct 2024 08:40:03 +0800
Subject: [PATCH 44/71] MAINTAINERS: add entry for min heap library code

Add a new entry in the MAINTAINERS file for the min heap library code,
with myself as the maintainer. I am pleased to take on the responsibility
of maintaining and reviewing patches for this library, as I am
well-acquainted with its details through recent contributions.
Link: https://lkml.kernel.org/r/20241027004003.987934-1-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu
Cc: Kuan-Wei Chiu
Signed-off-by: Andrew Morton
---
 MAINTAINERS | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index bdae0faf000c..c8c3baa4740c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15461,6 +15461,15 @@ F: arch/arm/boot/dts/marvell/armada-xp-crs326-24g-2s.dts
 F: arch/arm/boot/dts/marvell/armada-xp-crs328-4c-20s-4s-bit.dts
 F: arch/arm/boot/dts/marvell/armada-xp-crs328-4c-20s-4s.dts

+MIN HEAP
+M: Kuan-Wei Chiu
+L: linux-kernel@vger.kernel.org
+S: Maintained
+F: Documentation/core-api/min_heap.rst
+F: include/linux/min_heap.h
+F: lib/min_heap.c
+F: lib/test_min_heap.c
+
 MIPI CCS, SMIA AND SMIA++ IMAGE SENSOR DRIVER
 M: Sakari Ailus
 L: linux-media@vger.kernel.org

From 25f12e46a0e05f5f75fd434e8098564e6f6793a5 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi
Date: Thu, 24 Oct 2024 18:25:35 +0900
Subject: [PATCH 45/71] nilfs2: convert segment buffer to be folio-based

Patch series "nilfs2: Finish folio conversion".

This series converts all remaining page structure references in nilfs2 to
folio-based, except for the nilfs_copy_buffer function, which was
converted to use folios in advance for cross-fs page flags cleanup.

This prioritizes folio conversion, and does not include buffer head
reference reduction, nor does it add support for block sizes larger than
the system page size.

The first eight patches in this series mainly convert each of the
nilfs2-specific metadata implementations to use folios. The last four
patches, by Matthew Wilcox, eliminate aops writepage callbacks and convert
the remaining page structure references to folio-based. This part
reflects some corrections to the patch series posted by Matthew.

This patch (of 12):

In the segment buffer (log buffer) implementation, two parts of the block
buffer, CRC calculation and bio preparation, are still page-based, so
convert them to folio-based.
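The mechanical substitution used throughout the series can be summarized
in one before/after pair (a sketch; like the patches themselves, it
assumes the block size never exceeds the folio size):

	/* page-based (old) */
	kaddr = kmap_local_page(bh->b_page);
	data = kaddr + bh_offset(bh);
	...
	kunmap_local(kaddr);

	/* folio-based (new) */
	data = kmap_local_folio(bh->b_folio,
				offset_in_folio(bh->b_folio, bh->b_data));
	...
	kunmap_local(data);

kunmap_local() accepts any address inside the mapped range, so the typed
pointer can be unmapped directly instead of a separately remembered page
base address.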
Link: https://lkml.kernel.org/r/20241024092602.13395-1-konishi.ryusuke@gmail.com
Link: https://lkml.kernel.org/r/20241024092602.13395-2-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 fs/nilfs2/segbuf.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index dc431b4c34c9..e08cab03366b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -205,7 +205,6 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
-	void *kaddr;
 	u32 crc;

 	bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
@@ -220,9 +219,13 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
 		crc = crc32_le(crc, bh->b_data, bh->b_size);
 	}
 	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-		kaddr = kmap_local_page(bh->b_page);
-		crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
-		kunmap_local(kaddr);
+		size_t offset = offset_in_folio(bh->b_folio, bh->b_data);
+		unsigned char *from;
+
+		/* Do not support block sizes larger than PAGE_SIZE */
+		from = kmap_local_folio(bh->b_folio, offset);
+		crc = crc32_le(crc, from, bh->b_size);
+		kunmap_local(from);
 	}
 	raw_sum->ss_datasum = cpu_to_le32(crc);
 }
@@ -374,7 +377,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
 				  struct nilfs_write_info *wi,
 				  struct buffer_head *bh)
 {
-	int len, err;
+	int err;

 	BUG_ON(wi->nr_vecs <= 0);
 repeat:
@@ -385,8 +388,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
 			(wi->nilfs->ns_blocksize_bits - 9);
 	}

-	len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
-	if (len == bh->b_size) {
+	if (bio_add_folio(wi->bio, bh->b_folio, bh->b_size,
+			  offset_in_folio(bh->b_folio, bh->b_data))) {
 		wi->end++;
 		return 0;
 	}

From 4fd0a096f46887822b1138677510980fe03c002b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi
Date: Thu, 24 Oct 2024 18:25:36 +0900
Subject: [PATCH 46/71] nilfs2: convert common metadata file code to be folio-based

In the common routines for metadata files, nilfs_mdt_insert_new_block(),
which inserts a new block buffer into the cache, is still page-based, and
there are two places where bh_offset() is used. Convert these to be
folio-based.

Link: https://lkml.kernel.org/r/20241024092602.13395-3-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 fs/nilfs2/alloc.c  |  8 +++++---
 fs/nilfs2/cpfile.c |  4 ++--
 fs/nilfs2/mdt.c    | 21 +++++++++++++--------
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index ba50388ee4bf..d30dfed707b6 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -177,12 +177,14 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
 /**
  * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
  * @inode: inode of metadata file
  * @bh: buffer head of the buffer to be initialized
- * @kaddr: kernel address mapped for the page including the buffer
+ * @from: kernel address mapped for a chunk of the block
+ *
+ * This function does not yet support the case where block size > PAGE_SIZE.
 */
 static void nilfs_palloc_desc_block_init(struct inode *inode,
-					 struct buffer_head *bh, void *kaddr)
+					 struct buffer_head *bh, void *from)
 {
-	struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh);
+	struct nilfs_palloc_group_desc *desc = from;
 	unsigned long n = nilfs_palloc_groups_per_desc_block(inode);
 	__le32 nfrees;

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index f0ce37552446..a8046cbf2753 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -113,9 +113,9 @@ nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno,
 static void nilfs_cpfile_block_init(struct inode *cpfile,
 				    struct buffer_head *bh,
-				    void *kaddr)
+				    void *from)
 {
-	struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
+	struct nilfs_checkpoint *cp = from;
 	size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
 	int n = nilfs_cpfile_checkpoints_per_block(cpfile);

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index ceb7dc0b5bad..a4c1e00aaaac 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -33,7 +33,8 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
 		  struct buffer_head *, void *))
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	void *kaddr;
+	struct folio *folio = bh->b_folio;
+	void *from;
 	int ret;

 	/* Caller exclude read accesses using page lock */
@@ -47,12 +48,14 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
 	set_buffer_mapped(bh);

-	kaddr = kmap_local_page(bh->b_page);
-	memset(kaddr + bh_offset(bh), 0, i_blocksize(inode));
+	/* Initialize block (block size > PAGE_SIZE not yet supported) */
+	from = kmap_local_folio(folio, offset_in_folio(folio, bh->b_data));
+	memset(from, 0, bh->b_size);
 	if (init_block)
-		init_block(inode, bh, kaddr);
-	flush_dcache_page(bh->b_page);
-	kunmap_local(kaddr);
+		init_block(inode, bh, from);
+	kunmap_local(from);
+
+	flush_dcache_folio(folio);

 	set_buffer_uptodate(bh);
 	mark_buffer_dirty(bh);
@@ -571,7 +574,8 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
 	if (!bh_frozen)
 		bh_frozen = create_empty_buffers(folio, 1 << blkbits, 0);

-	bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits);
+	bh_frozen = get_nth_bh(bh_frozen,
+			       offset_in_folio(folio, bh->b_data) >> blkbits);

 	if (!buffer_uptodate(bh_frozen))
 		nilfs_copy_buffer(bh_frozen, bh);
@@ -601,7 +605,8 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
 	if (!IS_ERR(folio)) {
 		bh_frozen = folio_buffers(folio);
 		if (bh_frozen) {
-			n = bh_offset(bh) >> inode->i_blkbits;
+			n = offset_in_folio(folio, bh->b_data) >>
+				inode->i_blkbits;
 			bh_frozen = get_nth_bh(bh_frozen, n);
 		}
 		folio_unlock(folio);

From 832acfe6ea0365524a35df1e9b1d7350ed9ea5f5 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi
Date: Thu, 24 Oct 2024 18:25:37 +0900
Subject: [PATCH 47/71] nilfs2: convert segment usage file to be folio-based

For the sufile, which is a metadata file that holds information about
managing segments, convert the page-based implementation to a folio-based
implementation. kmap_local_page() is changed to kmap_local_folio(), and
offsets within a page that were calculated using bh_offset() are now
computed with offset_in_folio(), via an additional helper function
nilfs_sufile_segment_usage_offset().
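In outline, each access then becomes one offset computation plus one typed
mapping (shape taken from the hunks below):

	offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh);
	su = kmap_local_folio(su_bh->b_folio, offset);
	/* ... read or update the nilfs_segment_usage at 'su' ... */
	kunmap_local(su);	/* unmap via the typed pointer */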
Link: https://lkml.kernel.org/r/20241024092602.13395-4-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/nilfs2/sufile.c | 160 +++++++++++++++++++++++---------------------- 1 file changed, 82 insertions(+), 78 deletions(-) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index eea5a6a12f7b..d3ecc813d633 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -70,11 +70,20 @@ nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, max - curr + 1); } -static struct nilfs_segment_usage * -nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, - struct buffer_head *bh, void *kaddr) +/** + * nilfs_sufile_segment_usage_offset - calculate the byte offset of a segment + * usage entry in the folio containing it + * @sufile: segment usage file inode + * @segnum: number of segment usage + * @bh: buffer head of block containing segment usage indexed by @segnum + * + * Return: Byte offset in the folio of the segment usage entry. + */ +static size_t nilfs_sufile_segment_usage_offset(const struct inode *sufile, + __u64 segnum, + struct buffer_head *bh) { - return kaddr + bh_offset(bh) + + return offset_in_folio(bh->b_folio, bh->b_data) + nilfs_sufile_get_offset(sufile, segnum) * NILFS_MDT(sufile)->mi_entry_size; } @@ -112,13 +121,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, u64 ncleanadd, u64 ndirtyadd) { struct nilfs_sufile_header *header; - void *kaddr; - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->sh_ncleansegs, ncleanadd); le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); - kunmap_local(kaddr); + kunmap_local(header); mark_buffer_dirty(header_bh); } @@ -313,6 +320,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) struct nilfs_sufile_info *sui = NILFS_SUI(sufile); size_t susz = NILFS_MDT(sufile)->mi_entry_size; __u64 segnum, maxsegnum, last_alloc; + size_t offset; void *kaddr; unsigned long nsegments, nsus, cnt; int ret, j; @@ -322,10 +330,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); last_alloc = le64_to_cpu(header->sh_last_alloc); - kunmap_local(kaddr); + kunmap_local(header); nsegments = nilfs_sufile_get_nsegments(sufile); maxsegnum = sui->allocmax; @@ -359,9 +366,10 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) &su_bh); if (ret < 0) goto out_header; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage( - sufile, segnum, su_bh, kaddr); + + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, + su_bh); + su = kaddr = kmap_local_folio(su_bh->b_folio, offset); nsus = nilfs_sufile_segment_usages_in_block( sufile, segnum, maxsegnum); @@ -372,12 +380,11 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) nilfs_segment_usage_set_dirty(su); kunmap_local(kaddr); - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->sh_ncleansegs, -1); le64_add_cpu(&header->sh_ndirtysegs, 1); header->sh_last_alloc = cpu_to_le64(segnum); - kunmap_local(kaddr); + kunmap_local(header); sui->ncleansegs--; mark_buffer_dirty(header_bh); @@ 
-411,18 +418,18 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); + su = kmap_local_folio(su_bh->b_folio, offset); if (unlikely(!nilfs_segment_usage_clean(su))) { nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean", __func__, (unsigned long long)segnum); - kunmap_local(kaddr); + kunmap_local(su); return; } nilfs_segment_usage_set_dirty(su); - kunmap_local(kaddr); + kunmap_local(su); nilfs_sufile_mod_counter(header_bh, -1, 1); NILFS_SUI(sufile)->ncleansegs--; @@ -436,14 +443,14 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; int clean, dirty; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); + su = kmap_local_folio(su_bh->b_folio, offset); if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) && su->su_nblocks == cpu_to_le32(0)) { - kunmap_local(kaddr); + kunmap_local(su); return; } clean = nilfs_segment_usage_clean(su); @@ -453,7 +460,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, su->su_lastmod = cpu_to_le64(0); su->su_nblocks = cpu_to_le32(0); su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)); - kunmap_local(kaddr); + kunmap_local(su); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); NILFS_SUI(sufile)->ncleansegs -= clean; @@ -467,15 +474,15 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; int sudirty; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); + su = kmap_local_folio(su_bh->b_folio, offset); if (nilfs_segment_usage_clean(su)) { nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean", __func__, (unsigned long long)segnum); - kunmap_local(kaddr); + kunmap_local(su); return; } if (unlikely(nilfs_segment_usage_error(su))) @@ -488,7 +495,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, (unsigned long long)segnum); nilfs_segment_usage_set_clean(su); - kunmap_local(kaddr); + kunmap_local(su); mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? 
(u64)-1 : 0); @@ -507,7 +514,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; - void *kaddr; + size_t offset; struct nilfs_segment_usage *su; int ret; @@ -523,12 +530,12 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) goto out_sem; } - kaddr = kmap_local_page(bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, bh); + su = kmap_local_folio(bh->b_folio, offset); if (unlikely(nilfs_segment_usage_error(su))) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; - kunmap_local(kaddr); + kunmap_local(su); brelse(bh); if (nilfs_segment_is_active(nilfs, segnum)) { nilfs_error(sufile->i_sb, @@ -546,7 +553,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) ret = -EIO; } else { nilfs_segment_usage_set_dirty(su); - kunmap_local(kaddr); + kunmap_local(su); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); @@ -568,7 +575,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, { struct buffer_head *bh; struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; int ret; down_write(&NILFS_MDT(sufile)->mi_sem); @@ -576,8 +583,8 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, if (ret < 0) goto out_sem; - kaddr = kmap_local_page(bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, bh); + su = kmap_local_folio(bh->b_folio, offset); if (modtime) { /* * Check segusage error and set su_lastmod only when updating @@ -587,7 +594,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, su->su_lastmod = cpu_to_le64(modtime); } su->su_nblocks = cpu_to_le32(nblocks); - kunmap_local(kaddr); + kunmap_local(su); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); @@ -619,7 +626,6 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) struct buffer_head *header_bh; struct nilfs_sufile_header *header; struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; - void *kaddr; int ret; down_read(&NILFS_MDT(sufile)->mi_sem); @@ -628,8 +634,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) if (ret < 0) goto out_sem; - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); @@ -638,7 +643,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) spin_lock(&nilfs->ns_last_segment_lock); sustat->ss_prot_seq = nilfs->ns_prot_seq; spin_unlock(&nilfs->ns_last_segment_lock); - kunmap_local(kaddr); + kunmap_local(header); brelse(header_bh); out_sem: @@ -651,18 +656,18 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; int suclean; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); + su = kmap_local_folio(su_bh->b_folio, offset); if (nilfs_segment_usage_error(su)) { - kunmap_local(kaddr); + kunmap_local(su); return; } suclean = nilfs_segment_usage_clean(su); 
nilfs_segment_usage_set_error(su); - kunmap_local(kaddr); + kunmap_local(su); if (suclean) { nilfs_sufile_mod_counter(header_bh, -1, 0); @@ -700,7 +705,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, unsigned long segusages_per_block; unsigned long nsegs, ncleaned; __u64 segnum; - void *kaddr; + size_t offset; ssize_t n, nc; int ret; int j; @@ -731,16 +736,16 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, /* hole */ continue; } - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage( - sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, + su_bh); + su = kmap_local_folio(su_bh->b_folio, offset); su2 = su; for (j = 0; j < n; j++, su = (void *)su + susz) { if ((le32_to_cpu(su->su_flags) & ~BIT(NILFS_SEGMENT_USAGE_ERROR)) || nilfs_segment_is_active(nilfs, segnum + j)) { ret = -EBUSY; - kunmap_local(kaddr); + kunmap_local(su2); brelse(su_bh); goto out_header; } @@ -752,7 +757,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, nc++; } } - kunmap_local(kaddr); + kunmap_local(su2); if (nc > 0) { mark_buffer_dirty(su_bh); ncleaned += nc; @@ -799,7 +804,6 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) struct buffer_head *header_bh; struct nilfs_sufile_header *header; struct nilfs_sufile_info *sui = NILFS_SUI(sufile); - void *kaddr; unsigned long nsegs, nrsvsegs; int ret = 0; @@ -837,10 +841,9 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) sui->allocmin = 0; } - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); - kunmap_local(kaddr); + kunmap_local(header); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(sufile); @@ -874,6 +877,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, struct nilfs_suinfo *si = buf; size_t susz = NILFS_MDT(sufile)->mi_entry_size; struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + size_t offset; void *kaddr; unsigned long nsegs, segusages_per_block; ssize_t n; @@ -901,9 +905,9 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, continue; } - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage( - sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, + su_bh); + su = kaddr = kmap_local_folio(su_bh->b_folio, offset); for (j = 0; j < n; j++, su = (void *)su + susz, si = (void *)si + sisz) { si->sui_lastmod = le64_to_cpu(su->su_lastmod); @@ -951,7 +955,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, struct buffer_head *header_bh, *bh; struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup; struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; unsigned long blkoff, prev_blkoff; int cleansi, cleansu, dirtysi, dirtysu; long ncleaned = 0, ndirtied = 0; @@ -983,9 +987,9 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, goto out_header; for (;;) { - kaddr = kmap_local_page(bh->b_page); - su = nilfs_sufile_block_get_segment_usage( - sufile, sup->sup_segnum, bh, kaddr); + offset = nilfs_sufile_segment_usage_offset( + sufile, sup->sup_segnum, bh); + su = kmap_local_folio(bh->b_folio, offset); if (nilfs_suinfo_update_lastmod(sup)) su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod); @@ -1020,7 +1024,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, su->su_flags = 
cpu_to_le32(sup->sup_sui.sui_flags);
 		}

-		kunmap_local(kaddr);
+		kunmap_local(su);

 		sup = (void *)sup + supsz;
 		if (sup >= supend)
@@ -1076,6 +1080,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
 	struct buffer_head *su_bh;
 	struct nilfs_segment_usage *su;
+	size_t offset;
 	void *kaddr;
 	size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
 	sector_t seg_start, seg_end, start_block, end_block;
@@ -1125,9 +1130,9 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 			continue;
 		}

-		kaddr = kmap_local_page(su_bh->b_page);
-		su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
-							  su_bh, kaddr);
+		offset = nilfs_sufile_segment_usage_offset(sufile, segnum,
+							   su_bh);
+		su = kaddr = kmap_local_folio(su_bh->b_folio, offset);
 		for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
 			if (!nilfs_segment_usage_clean(su))
 				continue;
@@ -1167,9 +1172,10 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 				}

 				ndiscarded += nblocks;
-				kaddr = kmap_local_page(su_bh->b_page);
-				su = nilfs_sufile_block_get_segment_usage(
-					sufile, segnum, su_bh, kaddr);
+				offset = nilfs_sufile_segment_usage_offset(
+					sufile, segnum, su_bh);
+				su = kaddr = kmap_local_folio(su_bh->b_folio,
+							      offset);
 			}

 			/* start new extent */
@@ -1221,7 +1227,6 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
 	struct nilfs_sufile_info *sui;
 	struct buffer_head *header_bh;
 	struct nilfs_sufile_header *header;
-	void *kaddr;
 	int err;

 	if (susize > sb->s_blocksize) {
@@ -1262,10 +1267,9 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
 	}

 	sui = NILFS_SUI(sufile);

-	kaddr = kmap_local_page(header_bh->b_page);
-	header = kaddr + bh_offset(header_bh);
+	header = kmap_local_folio(header_bh->b_folio, 0);
 	sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
-	kunmap_local(kaddr);
+	kunmap_local(header);
 	brelse(header_bh);

 	sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;

From 21cf934eed5c82994ce570b43b3a3ed049e4275a Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi
Date: Thu, 24 Oct 2024 18:25:38 +0900
Subject: [PATCH 48/71] nilfs2: convert persistent object allocator to be folio-based

Regarding the persistent object allocator, a common mechanism for
allocating objects in metadata files such as inodes and DAT entries,
convert the page-based implementation to a folio-based implementation.

In this conversion, helper functions nilfs_palloc_group_desc_offset() and
nilfs_palloc_bitmap_offset() are added and used to calculate the byte
offset within a folio of a group descriptor structure and bitmap,
respectively, to replace kmap_local_page() with kmap_local_folio().

In addition, a helper function called nilfs_palloc_entry_offset() is
provided to facilitate common calculation of the byte offset within a
folio of metadata file entries managed in the persistent object allocator
format.
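A condensed view of how the new helpers are used together, taken in shape
from the hunks below (locking and error handling omitted):

	doff = nilfs_palloc_group_desc_offset(inode, group, desc_bh);
	desc = kmap_local_folio(desc_bh->b_folio, doff);

	boff = nilfs_palloc_bitmap_offset(bitmap_bh);
	bitmap = kmap_local_folio(bitmap_bh->b_folio, boff);

	/* ... test/set bits and adjust the group's free-entry count ... */

	kunmap_local(bitmap);
	kunmap_local(desc);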
Link: https://lkml.kernel.org/r/20241024092602.13395-5-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/nilfs2/alloc.c | 137 +++++++++++++++++++++++++++++----------------- fs/nilfs2/alloc.h | 2 + 2 files changed, 89 insertions(+), 50 deletions(-) diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index d30dfed707b6..5e0a6bd3e015 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -339,19 +339,55 @@ static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) } /** - * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor + * nilfs_palloc_group_desc_offset - calculate the byte offset of a group + * descriptor in the folio containing it * @inode: inode of metadata file using this allocator * @group: group number - * @bh: buffer head of the buffer storing the group descriptor block - * @kaddr: kernel address mapped for the page including the buffer + * @bh: buffer head of the group descriptor block + * + * Return: Byte offset in the folio of the group descriptor for @group. */ -static struct nilfs_palloc_group_desc * -nilfs_palloc_block_get_group_desc(const struct inode *inode, - unsigned long group, - const struct buffer_head *bh, void *kaddr) +static size_t nilfs_palloc_group_desc_offset(const struct inode *inode, + unsigned long group, + const struct buffer_head *bh) { - return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) + - group % nilfs_palloc_groups_per_desc_block(inode); + return offset_in_folio(bh->b_folio, bh->b_data) + + sizeof(struct nilfs_palloc_group_desc) * + (group % nilfs_palloc_groups_per_desc_block(inode)); +} + +/** + * nilfs_palloc_bitmap_offset - calculate the byte offset of a bitmap block + * in the folio containing it + * @bh: buffer head of the bitmap block + * + * Return: Byte offset in the folio of the bitmap block for @bh. + */ +static size_t nilfs_palloc_bitmap_offset(const struct buffer_head *bh) +{ + return offset_in_folio(bh->b_folio, bh->b_data); +} + +/** + * nilfs_palloc_entry_offset - calculate the byte offset of an entry in the + * folio containing it + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @bh: buffer head of the entry block + * + * Return: Byte offset in the folio of the entry @nr. 
+ */ +size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, + const struct buffer_head *bh) +{ + unsigned long entry_index_in_group, entry_index_in_block; + + nilfs_palloc_group(inode, nr, &entry_index_in_group); + entry_index_in_block = entry_index_in_group % + NILFS_MDT(inode)->mi_entries_per_block; + + return offset_in_folio(bh->b_folio, bh->b_data) + + entry_index_in_block * NILFS_MDT(inode)->mi_entry_size; } /** @@ -508,7 +544,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; - void *desc_kaddr, *bitmap_kaddr; + size_t doff, boff; unsigned long group, maxgroup, ngroups; unsigned long group_offset, maxgroup_offset; unsigned long n, entries_per_group; @@ -531,17 +567,17 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; - desc_kaddr = kmap_local_page(desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc( - inode, group, desc_bh, desc_kaddr); + + doff = nilfs_palloc_group_desc_offset(inode, group, desc_bh); + desc = kmap_local_folio(desc_bh->b_folio, doff); n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); - for (j = 0; j < n; j++, desc++, group++, group_offset = 0) { + for (j = 0; j < n; j++, group++, group_offset = 0) { lock = nilfs_mdt_bgl_lock(inode, group); - if (nilfs_palloc_group_desc_nfrees(desc, lock) == 0) + if (nilfs_palloc_group_desc_nfrees(&desc[j], lock) == 0) continue; - kunmap_local(desc_kaddr); + kunmap_local(desc); ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); if (unlikely(ret < 0)) { @@ -549,12 +585,14 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, return ret; } - desc_kaddr = kmap_local_page(desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc( - inode, group, desc_bh, desc_kaddr); + /* + * Re-kmap the folio containing the first (and + * subsequent) group descriptors. + */ + desc = kmap_local_folio(desc_bh->b_folio, doff); - bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + boff = nilfs_palloc_bitmap_offset(bitmap_bh); + bitmap = kmap_local_folio(bitmap_bh->b_folio, boff); pos = nilfs_palloc_find_available_slot( bitmap, group_offset, entries_per_group, lock, wrap); @@ -564,14 +602,14 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, * beginning, the wrap flag only has an effect on the * first search. 
*/ - kunmap_local(bitmap_kaddr); + kunmap_local(bitmap); if (pos >= 0) goto found; brelse(bitmap_bh); } - kunmap_local(desc_kaddr); + kunmap_local(desc); brelse(desc_bh); } @@ -580,9 +618,9 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, found: /* found a free entry */ - nilfs_palloc_group_desc_add_entries(desc, lock, -1); + nilfs_palloc_group_desc_add_entries(&desc[j], lock, -1); req->pr_entry_nr = entries_per_group * group + pos; - kunmap_local(desc_kaddr); + kunmap_local(desc); req->pr_desc_bh = desc_bh; req->pr_bitmap_bh = bitmap_bh; @@ -613,18 +651,18 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode, void nilfs_palloc_commit_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { - struct nilfs_palloc_group_desc *desc; unsigned long group, group_offset; + size_t doff, boff; + struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; - void *desc_kaddr, *bitmap_kaddr; spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); - desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc(inode, group, - req->pr_desc_bh, desc_kaddr); - bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + doff = nilfs_palloc_group_desc_offset(inode, group, req->pr_desc_bh); + desc = kmap_local_folio(req->pr_desc_bh->b_folio, doff); + + boff = nilfs_palloc_bitmap_offset(req->pr_bitmap_bh); + bitmap = kmap_local_folio(req->pr_bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) @@ -635,8 +673,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, else nilfs_palloc_group_desc_add_entries(desc, lock, 1); - kunmap_local(bitmap_kaddr); - kunmap_local(desc_kaddr); + kunmap_local(bitmap); + kunmap_local(desc); mark_buffer_dirty(req->pr_desc_bh); mark_buffer_dirty(req->pr_bitmap_bh); @@ -655,17 +693,17 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { struct nilfs_palloc_group_desc *desc; - void *desc_kaddr, *bitmap_kaddr; + size_t doff, boff; unsigned char *bitmap; unsigned long group, group_offset; spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); - desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc(inode, group, - req->pr_desc_bh, desc_kaddr); - bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + doff = nilfs_palloc_group_desc_offset(inode, group, req->pr_desc_bh); + desc = kmap_local_folio(req->pr_desc_bh->b_folio, doff); + + boff = nilfs_palloc_bitmap_offset(req->pr_bitmap_bh); + bitmap = kmap_local_folio(req->pr_bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) @@ -676,8 +714,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, else nilfs_palloc_group_desc_add_entries(desc, lock, 1); - kunmap_local(bitmap_kaddr); - kunmap_local(desc_kaddr); + kunmap_local(bitmap); + kunmap_local(desc); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); @@ -741,7 +779,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; - void *desc_kaddr, *bitmap_kaddr; + size_t doff, boff; unsigned long group, group_offset; __u64 group_min_nr, last_nrs[8]; const unsigned long epg = 
nilfs_palloc_entries_per_group(inode); @@ -769,8 +807,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) /* Get the first entry number of the group */ group_min_nr = (__u64)group * epg; - bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + boff = nilfs_palloc_bitmap_offset(bitmap_bh); + bitmap = kmap_local_folio(bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); j = i; @@ -815,7 +853,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) entry_start = rounddown(group_offset, epb); } while (true); - kunmap_local(bitmap_kaddr); + kunmap_local(bitmap); mark_buffer_dirty(bitmap_bh); brelse(bitmap_bh); @@ -829,11 +867,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) inode->i_ino); } - desc_kaddr = kmap_local_page(desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc( - inode, group, desc_bh, desc_kaddr); + doff = nilfs_palloc_group_desc_offset(inode, group, desc_bh); + desc = kmap_local_folio(desc_bh->b_folio, doff); nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); - kunmap_local(desc_kaddr); + kunmap_local(desc); mark_buffer_dirty(desc_bh); nilfs_mdt_mark_dirty(inode); brelse(desc_bh); diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index e19d7eb10084..af8f882619d4 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -33,6 +33,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int, struct buffer_head **); void *nilfs_palloc_block_get_entry(const struct inode *, __u64, const struct buffer_head *, void *); +size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, + const struct buffer_head *bh); int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *); From f99de3d5703a92cc18a9a95995b99b8401331bf7 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 24 Oct 2024 18:25:39 +0900 Subject: [PATCH 49/71] nilfs2: convert inode file to be folio-based Convert the page-based implementation of ifile, a metadata file that manages inodes, to folio-based. 
Link: https://lkml.kernel.org/r/20241024092602.13395-6-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/nilfs2/ifile.c | 10 +++++----- fs/nilfs2/ifile.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 1e86b9303b7c..e7339eb3c08a 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -98,7 +98,7 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) .pr_entry_nr = ino, .pr_entry_bh = NULL }; struct nilfs_inode *raw_inode; - void *kaddr; + size_t offset; int ret; ret = nilfs_palloc_prepare_free_entry(ifile, &req); @@ -113,11 +113,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) return ret; } - kaddr = kmap_local_page(req.pr_entry_bh->b_page); - raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, - req.pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(ifile, req.pr_entry_nr, + req.pr_entry_bh); + raw_inode = kmap_local_folio(req.pr_entry_bh->b_folio, offset); raw_inode->i_flags = 0; - kunmap_local(kaddr); + kunmap_local(raw_inode); mark_buffer_dirty(req.pr_entry_bh); brelse(req.pr_entry_bh); diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 625545cc2a98..5d116a566d9e 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -21,9 +21,9 @@ static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) { - void *kaddr = kmap_local_page(ibh->b_page); + size_t __offset_in_folio = nilfs_palloc_entry_offset(ifile, ino, ibh); - return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); + return kmap_local_folio(ibh->b_folio, __offset_in_folio); } static inline void nilfs_ifile_unmap_inode(struct nilfs_inode *raw_inode) From aac6925e20e0e9476bc906f6bd83b6c508430d5a Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 24 Oct 2024 18:25:40 +0900 Subject: [PATCH 50/71] nilfs2: convert DAT file to be folio-based Regarding the DAT, a metadata file that manages virtual block addresses, convert the page-based implementation to a folio-based implementation. 
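One hunk below deserves a note: nilfs_dat_get_vinfo() now maps the first entry of a block once and strides through the remaining entries by entry size, instead of re-deriving each entry's address. In sketch form (mirroring the hunk; not an extra change):

	offset = nilfs_palloc_entry_offset(dat, first, entry_bh);
	first_entry = kmap_local_folio(entry_bh->b_folio, offset);
	for (j = i, n = 0;
	     j < nvi && vinfo->vi_vblocknr >= first &&
		     vinfo->vi_vblocknr <= last;
	     j++, n++, vinfo = (void *)vinfo + visz) {
		/* entries are contiguous, so index off the first one */
		entry = (void *)first_entry +
			(vinfo->vi_vblocknr - first) * entry_size;
		vinfo->vi_start = le64_to_cpu(entry->de_start);
		vinfo->vi_end = le64_to_cpu(entry->de_end);
		vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
	}
	kunmap_local(first_entry);

This keeps a single mapping live for the whole block rather than performing one lookup per entry.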
Link: https://lkml.kernel.org/r/20241024092602.13395-7-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/nilfs2/dat.c | 98 ++++++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 46 deletions(-) diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 0bef662176a4..e220dcb08aa6 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -89,15 +89,15 @@ int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MAX); entry->de_blocknr = cpu_to_le64(0); - kunmap_local(kaddr); + kunmap_local(entry); nilfs_palloc_commit_alloc_entry(dat, req); nilfs_dat_commit_entry(dat, req); @@ -113,15 +113,15 @@ static void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MIN); entry->de_blocknr = cpu_to_le64(0); - kunmap_local(kaddr); + kunmap_local(entry); nilfs_dat_commit_entry(dat, req); @@ -143,14 +143,14 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, sector_t blocknr) { struct nilfs_dat_entry *entry; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); entry->de_blocknr = cpu_to_le64(blocknr); - kunmap_local(kaddr); + kunmap_local(entry); nilfs_dat_commit_entry(dat, req); } @@ -160,19 +160,19 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) struct nilfs_dat_entry *entry; __u64 start; sector_t blocknr; - void *kaddr; + size_t offset; int ret; ret = nilfs_dat_prepare_entry(dat, req, 0); if (ret < 0) return ret; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_local(kaddr); + kunmap_local(entry); if (blocknr == 0) { ret = nilfs_palloc_prepare_free_entry(dat, req); @@ -200,11 +200,11 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, struct nilfs_dat_entry *entry; __u64 start, end; sector_t blocknr; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, 
req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); end = start = le64_to_cpu(entry->de_start); if (!dead) { end = nilfs_mdt_cno(dat); @@ -212,7 +212,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, } entry->de_end = cpu_to_le64(end); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_local(kaddr); + kunmap_local(entry); if (blocknr == 0) nilfs_dat_commit_free(dat, req); @@ -225,14 +225,14 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) struct nilfs_dat_entry *entry; __u64 start; sector_t blocknr; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_local(kaddr); + kunmap_local(entry); if (start == nilfs_mdt_cno(dat) && blocknr == 0) nilfs_palloc_abort_free_entry(dat, req); @@ -336,7 +336,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) { struct buffer_head *entry_bh; struct nilfs_dat_entry *entry; - void *kaddr; + size_t offset; int ret; ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); @@ -359,21 +359,21 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) } } - kaddr = kmap_local_page(entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, vblocknr, entry_bh); + entry = kmap_local_folio(entry_bh->b_folio, offset); if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { nilfs_crit(dat->i_sb, "%s: invalid vblocknr = %llu, [%llu, %llu)", __func__, (unsigned long long)vblocknr, (unsigned long long)le64_to_cpu(entry->de_start), (unsigned long long)le64_to_cpu(entry->de_end)); - kunmap_local(kaddr); + kunmap_local(entry); brelse(entry_bh); return -EINVAL; } WARN_ON(blocknr == 0); entry->de_blocknr = cpu_to_le64(blocknr); - kunmap_local(kaddr); + kunmap_local(entry); mark_buffer_dirty(entry_bh); nilfs_mdt_mark_dirty(dat); @@ -407,7 +407,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) struct buffer_head *entry_bh, *bh; struct nilfs_dat_entry *entry; sector_t blocknr; - void *kaddr; + size_t offset; int ret; ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); @@ -423,8 +423,8 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) } } - kaddr = kmap_local_page(entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, vblocknr, entry_bh); + entry = kmap_local_folio(entry_bh->b_folio, offset); blocknr = le64_to_cpu(entry->de_blocknr); if (blocknr == 0) { ret = -ENOENT; @@ -433,7 +433,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) *blocknrp = blocknr; out: - kunmap_local(kaddr); + kunmap_local(entry); brelse(entry_bh); return ret; } @@ -442,11 +442,12 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, size_t nvi) { struct buffer_head *entry_bh; - struct nilfs_dat_entry *entry; + struct nilfs_dat_entry *entry, *first_entry; struct nilfs_vinfo *vinfo = buf; __u64 first, last; - void *kaddr; + size_t offset; 
unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; + unsigned int entry_size = NILFS_MDT(dat)->mi_entry_size; int i, j, n, ret; for (i = 0; i < nvi; i += n) { @@ -454,23 +455,28 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, 0, &entry_bh); if (ret < 0) return ret; - kaddr = kmap_local_page(entry_bh->b_page); - /* last virtual block number in this block */ + first = vinfo->vi_vblocknr; first = div64_ul(first, entries_per_block); first *= entries_per_block; + /* first virtual block number in this block */ + last = first + entries_per_block - 1; + /* last virtual block number in this block */ + + offset = nilfs_palloc_entry_offset(dat, first, entry_bh); + first_entry = kmap_local_folio(entry_bh->b_folio, offset); for (j = i, n = 0; j < nvi && vinfo->vi_vblocknr >= first && vinfo->vi_vblocknr <= last; j++, n++, vinfo = (void *)vinfo + visz) { - entry = nilfs_palloc_block_get_entry( - dat, vinfo->vi_vblocknr, entry_bh, kaddr); + entry = (void *)first_entry + + (vinfo->vi_vblocknr - first) * entry_size; vinfo->vi_start = le64_to_cpu(entry->de_start); vinfo->vi_end = le64_to_cpu(entry->de_end); vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr); } - kunmap_local(kaddr); + kunmap_local(first_entry); brelse(entry_bh); } From cdee17960f67d1dad0738ef3ae9f1a63d3c92138 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 24 Oct 2024 18:25:41 +0900 Subject: [PATCH 51/71] nilfs2: remove nilfs_palloc_block_get_entry() All calls to nilfs_palloc_block_get_entry() are now gone, so remove it. Link: https://lkml.kernel.org/r/20241024092602.13395-8-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/nilfs2/alloc.c | 19 ------------------- fs/nilfs2/alloc.h | 2 -- 2 files changed, 21 deletions(-) diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 5e0a6bd3e015..ba3e1f591f36 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -390,25 +390,6 @@ size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, entry_index_in_block * NILFS_MDT(inode)->mi_entry_size; } -/** - * nilfs_palloc_block_get_entry - get kernel address of an entry - * @inode: inode of metadata file using this allocator - * @nr: serial number of the entry (e.g. 
inode number) - * @bh: buffer head of the buffer storing the entry block - * @kaddr: kernel address mapped for the page including the buffer - */ -void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, - const struct buffer_head *bh, void *kaddr) -{ - unsigned long entry_offset, group_offset; - - nilfs_palloc_group(inode, nr, &group_offset); - entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block; - - return kaddr + bh_offset(bh) + - entry_offset * NILFS_MDT(inode)->mi_entry_size; -} - /** * nilfs_palloc_find_available_slot - find available slot in a group * @bitmap: bitmap of the group diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index af8f882619d4..3f115ab7e9a7 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -31,8 +31,6 @@ nilfs_palloc_entries_per_group(const struct inode *inode) int nilfs_palloc_init_blockgroup(struct inode *, unsigned int); int nilfs_palloc_get_entry_block(struct inode *, __u64, int, struct buffer_head **); -void *nilfs_palloc_block_get_entry(const struct inode *, __u64, - const struct buffer_head *, void *); size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, const struct buffer_head *bh); From a6cb5b1e9c707c3a43ede691c7faee45e796458b Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 24 Oct 2024 18:25:42 +0900 Subject: [PATCH 52/71] nilfs2: convert checkpoint file to be folio-based Regarding the cpfile, a metadata file that manages checkpoints, convert the page-based implementation to a folio-based implementation. This change involves adding some helper functions to calculate byte offsets on folios and removing a few helper functions that are no longer needed. Link: https://lkml.kernel.org/r/20241024092602.13395-9-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 379 ++++++++++++++++++++++++--------------------- 1 file changed, 204 insertions(+), 175 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index a8046cbf2753..c20207d7a989 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -68,49 +68,36 @@ static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, static unsigned int nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, struct buffer_head *bh, - void *kaddr, unsigned int n) { - struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + struct nilfs_checkpoint *cp; unsigned int count; + cp = kmap_local_folio(bh->b_folio, + offset_in_folio(bh->b_folio, bh->b_data)); count = le32_to_cpu(cp->cp_checkpoints_count) + n; cp->cp_checkpoints_count = cpu_to_le32(count); + kunmap_local(cp); return count; } static unsigned int nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, struct buffer_head *bh, - void *kaddr, unsigned int n) { - struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + struct nilfs_checkpoint *cp; unsigned int count; + cp = kmap_local_folio(bh->b_folio, + offset_in_folio(bh->b_folio, bh->b_data)); WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); count = le32_to_cpu(cp->cp_checkpoints_count) - n; cp->cp_checkpoints_count = cpu_to_le32(count); + kunmap_local(cp); return count; } -static inline struct nilfs_cpfile_header * -nilfs_cpfile_block_get_header(const struct inode *cpfile, - struct buffer_head *bh, - void *kaddr) -{ - return kaddr + bh_offset(bh); -} - -static struct nilfs_checkpoint * -nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno, - struct buffer_head *bh, - void *kaddr) -{ - return kaddr + 
bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) * - NILFS_MDT(cpfile)->mi_entry_size; -} - static void nilfs_cpfile_block_init(struct inode *cpfile, struct buffer_head *bh, void *from) @@ -125,6 +112,54 @@ static void nilfs_cpfile_block_init(struct inode *cpfile, } } +/** + * nilfs_cpfile_checkpoint_offset - calculate the byte offset of a checkpoint + * entry in the folio containing it + * @cpfile: checkpoint file inode + * @cno: checkpoint number + * @bh: buffer head of block containing checkpoint indexed by @cno + * + * Return: Byte offset in the folio of the checkpoint specified by @cno. + */ +static size_t nilfs_cpfile_checkpoint_offset(const struct inode *cpfile, + __u64 cno, + struct buffer_head *bh) +{ + return offset_in_folio(bh->b_folio, bh->b_data) + + nilfs_cpfile_get_offset(cpfile, cno) * + NILFS_MDT(cpfile)->mi_entry_size; +} + +/** + * nilfs_cpfile_cp_snapshot_list_offset - calculate the byte offset of a + * checkpoint snapshot list in the folio + * containing it + * @cpfile: checkpoint file inode + * @cno: checkpoint number + * @bh: buffer head of block containing checkpoint indexed by @cno + * + * Return: Byte offset in the folio of the checkpoint snapshot list specified + * by @cno. + */ +static size_t nilfs_cpfile_cp_snapshot_list_offset(const struct inode *cpfile, + __u64 cno, + struct buffer_head *bh) +{ + return nilfs_cpfile_checkpoint_offset(cpfile, cno, bh) + + offsetof(struct nilfs_checkpoint, cp_snapshot_list); +} + +/** + * nilfs_cpfile_ch_snapshot_list_offset - calculate the byte offset of the + * snapshot list in the header + * + * Return: Byte offset in the folio of the checkpoint snapshot list + */ +static size_t nilfs_cpfile_ch_snapshot_list_offset(void) +{ + return offsetof(struct nilfs_cpfile_header, ch_snapshot_list); +} + static int nilfs_cpfile_get_header_block(struct inode *cpfile, struct buffer_head **bhp) { @@ -214,7 +249,7 @@ int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, { struct buffer_head *cp_bh; struct nilfs_checkpoint *cp; - void *kaddr; + size_t offset; int ret; if (cno < 1 || cno > nilfs_mdt_cno(cpfile)) @@ -228,8 +263,8 @@ int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, goto out_sem; } - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { ret = -EINVAL; goto put_cp; @@ -254,7 +289,7 @@ int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, root->ifile = ifile; put_cp: - kunmap_local(kaddr); + kunmap_local(cp); brelse(cp_bh); out_sem: up_read(&NILFS_MDT(cpfile)->mi_sem); @@ -282,7 +317,7 @@ int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno) struct buffer_head *header_bh, *cp_bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; - void *kaddr; + size_t offset; int ret; if (WARN_ON_ONCE(cno < 1)) @@ -297,24 +332,22 @@ int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno) if (unlikely(ret < 0)) goto out_header; - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { /* a newly-created checkpoint */ nilfs_checkpoint_clear_invalid(cp); + kunmap_local(cp); if (!nilfs_cpfile_is_in_first(cpfile, cno)) nilfs_cpfile_block_add_valid_checkpoints(cpfile, 
cp_bh, - kaddr, 1); - kunmap_local(kaddr); + 1); - kaddr = kmap_local_page(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, - kaddr); + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_ncheckpoints, 1); - kunmap_local(kaddr); + kunmap_local(header); mark_buffer_dirty(header_bh); } else { - kunmap_local(kaddr); + kunmap_local(cp); } /* Force the buffer and the inode to become dirty */ @@ -353,7 +386,7 @@ int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, { struct buffer_head *cp_bh; struct nilfs_checkpoint *cp; - void *kaddr; + size_t offset; int ret; if (WARN_ON_ONCE(cno < 1)) @@ -367,10 +400,10 @@ int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, goto out_sem; } - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); if (unlikely(nilfs_checkpoint_invalid(cp))) { - kunmap_local(kaddr); + kunmap_local(cp); brelse(cp_bh); goto error; } @@ -391,7 +424,7 @@ int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, nilfs_write_inode_common(root->ifile, &cp->cp_ifile_inode); nilfs_bmap_write(NILFS_I(root->ifile)->i_bmap, &cp->cp_ifile_inode); - kunmap_local(kaddr); + kunmap_local(cp); brelse(cp_bh); out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); @@ -432,6 +465,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, struct nilfs_checkpoint *cp; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; __u64 cno; + size_t offset; void *kaddr; unsigned long tnicps; int ret, ncps, nicps, nss, count, i; @@ -462,9 +496,8 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, continue; } - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint( - cpfile, cno, cp_bh, kaddr); + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kaddr = kmap_local_folio(cp_bh->b_folio, offset); nicps = 0; for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { if (nilfs_checkpoint_snapshot(cp)) { @@ -474,43 +507,42 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, nicps++; } } - if (nicps > 0) { - tnicps += nicps; - mark_buffer_dirty(cp_bh); - nilfs_mdt_mark_dirty(cpfile); - if (!nilfs_cpfile_is_in_first(cpfile, cno)) { - count = - nilfs_cpfile_block_sub_valid_checkpoints( - cpfile, cp_bh, kaddr, nicps); - if (count == 0) { - /* make hole */ - kunmap_local(kaddr); - brelse(cp_bh); - ret = - nilfs_cpfile_delete_checkpoint_block( - cpfile, cno); - if (ret == 0) - continue; - nilfs_err(cpfile->i_sb, - "error %d deleting checkpoint block", - ret); - break; - } - } + kunmap_local(kaddr); + + if (nicps <= 0) { + brelse(cp_bh); + continue; } - kunmap_local(kaddr); + tnicps += nicps; + mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + if (nilfs_cpfile_is_in_first(cpfile, cno)) { + brelse(cp_bh); + continue; + } + + count = nilfs_cpfile_block_sub_valid_checkpoints(cpfile, cp_bh, + nicps); brelse(cp_bh); + if (count) + continue; + + /* Delete the block if there are no more valid checkpoints */ + ret = nilfs_cpfile_delete_checkpoint_block(cpfile, cno); + if (unlikely(ret)) { + nilfs_err(cpfile->i_sb, + "error %d deleting checkpoint block", ret); + break; + } } if (tnicps > 0) { - kaddr = kmap_local_page(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, - kaddr); + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); 
mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); - kunmap_local(kaddr); + kunmap_local(header); } brelse(header_bh); @@ -544,6 +576,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, struct buffer_head *bh; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; + size_t offset; void *kaddr; int n, ret; int ncps, i; @@ -562,8 +595,8 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, } ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); - kaddr = kmap_local_page(bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, bh); + cp = kaddr = kmap_local_folio(bh->b_folio, offset); for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { if (!nilfs_checkpoint_invalid(cp)) { nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, @@ -597,7 +630,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, struct nilfs_cpinfo *ci = buf; __u64 curr = *cnop, next; unsigned long curr_blkoff, next_blkoff; - void *kaddr; + size_t offset; int n = 0, ret; down_read(&NILFS_MDT(cpfile)->mi_sem); @@ -606,10 +639,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out; - kaddr = kmap_local_page(bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + header = kmap_local_folio(bh->b_folio, 0); curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); - kunmap_local(kaddr); + kunmap_local(header); brelse(bh); if (curr == 0) { ret = 0; @@ -627,9 +659,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, ret = 0; /* No snapshots (started from a hole block) */ goto out; } - kaddr = kmap_local_page(bh->b_page); + offset = nilfs_cpfile_checkpoint_offset(cpfile, curr, bh); + cp = kmap_local_folio(bh->b_folio, offset); while (n < nci) { - cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); curr = ~(__u64)0; /* Terminator */ if (unlikely(nilfs_checkpoint_invalid(cp) || !nilfs_checkpoint_snapshot(cp))) @@ -641,9 +673,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, if (next == 0) break; /* reach end of the snapshot list */ + kunmap_local(cp); next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); if (curr_blkoff != next_blkoff) { - kunmap_local(kaddr); brelse(bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &bh); @@ -651,12 +683,13 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, WARN_ON(ret == -ENOENT); goto out; } - kaddr = kmap_local_page(bh->b_page); } + offset = nilfs_cpfile_checkpoint_offset(cpfile, next, bh); + cp = kmap_local_folio(bh->b_folio, offset); curr = next; curr_blkoff = next_blkoff; } - kunmap_local(kaddr); + kunmap_local(cp); brelse(bh); *cnop = curr; ret = n; @@ -733,26 +766,6 @@ int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); } -static struct nilfs_snapshot_list * -nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile, - __u64 cno, - struct buffer_head *bh, - void *kaddr) -{ - struct nilfs_cpfile_header *header; - struct nilfs_checkpoint *cp; - struct nilfs_snapshot_list *list; - - if (cno != 0) { - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); - list = &cp->cp_snapshot_list; - } else { - header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); - 
list = &header->ch_snapshot_list; - } - return list; -} - static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) { struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; @@ -761,94 +774,103 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) struct nilfs_snapshot_list *list; __u64 curr, prev; unsigned long curr_blkoff, prev_blkoff; - void *kaddr; + size_t offset, curr_list_offset, prev_list_offset; int ret; if (cno == 0) return -ENOENT; /* checkpoint number 0 is invalid */ down_write(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (unlikely(ret < 0)) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) - goto out_sem; - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + goto out_header; + + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; - kunmap_local(kaddr); + kunmap_local(cp); goto out_cp; } if (nilfs_checkpoint_snapshot(cp)) { ret = 0; - kunmap_local(kaddr); + kunmap_local(cp); goto out_cp; } - kunmap_local(kaddr); + kunmap_local(cp); - ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); - if (ret < 0) - goto out_cp; - kaddr = kmap_local_page(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + /* + * Find the last snapshot before the checkpoint being changed to + * snapshot mode by going backwards through the snapshot list. + * Set "prev" to its checkpoint number, or 0 if not found. + */ + header = kmap_local_folio(header_bh->b_folio, 0); list = &header->ch_snapshot_list; curr_bh = header_bh; get_bh(curr_bh); curr = 0; curr_blkoff = 0; + curr_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); prev = le64_to_cpu(list->ssl_prev); while (prev > cno) { prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); curr = prev; + kunmap_local(list); if (curr_blkoff != prev_blkoff) { - kunmap_local(kaddr); brelse(curr_bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &curr_bh); - if (ret < 0) - goto out_header; - kaddr = kmap_local_page(curr_bh->b_page); + if (unlikely(ret < 0)) + goto out_cp; } + curr_list_offset = nilfs_cpfile_cp_snapshot_list_offset( + cpfile, curr, curr_bh); + list = kmap_local_folio(curr_bh->b_folio, curr_list_offset); curr_blkoff = prev_blkoff; - cp = nilfs_cpfile_block_get_checkpoint( - cpfile, curr, curr_bh, kaddr); - list = &cp->cp_snapshot_list; prev = le64_to_cpu(list->ssl_prev); } - kunmap_local(kaddr); + kunmap_local(list); if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, &prev_bh); if (ret < 0) goto out_curr; + + prev_list_offset = nilfs_cpfile_cp_snapshot_list_offset( + cpfile, prev, prev_bh); } else { prev_bh = header_bh; get_bh(prev_bh); + prev_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); } - kaddr = kmap_local_page(curr_bh->b_page); - list = nilfs_cpfile_block_get_snapshot_list( - cpfile, curr, curr_bh, kaddr); + /* Update the list entry for the next snapshot */ + list = kmap_local_folio(curr_bh->b_folio, curr_list_offset); list->ssl_prev = cpu_to_le64(cno); - kunmap_local(kaddr); + kunmap_local(list); - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + /* Update the checkpoint being changed to a snapshot */ + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); 
cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); nilfs_checkpoint_set_snapshot(cp); - kunmap_local(kaddr); + kunmap_local(cp); - kaddr = kmap_local_page(prev_bh->b_page); - list = nilfs_cpfile_block_get_snapshot_list( - cpfile, prev, prev_bh, kaddr); + /* Update the list entry for the previous snapshot */ + list = kmap_local_folio(prev_bh->b_folio, prev_list_offset); list->ssl_next = cpu_to_le64(cno); - kunmap_local(kaddr); + kunmap_local(list); - kaddr = kmap_local_page(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + /* Update the statistics in the header */ + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_nsnapshots, 1); - kunmap_local(kaddr); + kunmap_local(header); mark_buffer_dirty(prev_bh); mark_buffer_dirty(curr_bh); @@ -861,12 +883,12 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) out_curr: brelse(curr_bh); - out_header: - brelse(header_bh); - out_cp: brelse(cp_bh); + out_header: + brelse(header_bh); + out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; @@ -879,79 +901,87 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) struct nilfs_checkpoint *cp; struct nilfs_snapshot_list *list; __u64 next, prev; - void *kaddr; + size_t offset, next_list_offset, prev_list_offset; int ret; if (cno == 0) return -ENOENT; /* checkpoint number 0 is invalid */ down_write(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (unlikely(ret < 0)) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) - goto out_sem; - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + goto out_header; + + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; - kunmap_local(kaddr); + kunmap_local(cp); goto out_cp; } if (!nilfs_checkpoint_snapshot(cp)) { ret = 0; - kunmap_local(kaddr); + kunmap_local(cp); goto out_cp; } list = &cp->cp_snapshot_list; next = le64_to_cpu(list->ssl_next); prev = le64_to_cpu(list->ssl_prev); - kunmap_local(kaddr); + kunmap_local(cp); - ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); - if (ret < 0) - goto out_cp; if (next != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &next_bh); if (ret < 0) - goto out_header; + goto out_cp; + + next_list_offset = nilfs_cpfile_cp_snapshot_list_offset( + cpfile, next, next_bh); } else { next_bh = header_bh; get_bh(next_bh); + next_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); } if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, &prev_bh); if (ret < 0) goto out_next; + + prev_list_offset = nilfs_cpfile_cp_snapshot_list_offset( + cpfile, prev, prev_bh); } else { prev_bh = header_bh; get_bh(prev_bh); + prev_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); } - kaddr = kmap_local_page(next_bh->b_page); - list = nilfs_cpfile_block_get_snapshot_list( - cpfile, next, next_bh, kaddr); + /* Update the list entry for the next snapshot */ + list = kmap_local_folio(next_bh->b_folio, next_list_offset); list->ssl_prev = cpu_to_le64(prev); - kunmap_local(kaddr); + kunmap_local(list); - kaddr = kmap_local_page(prev_bh->b_page); - list = nilfs_cpfile_block_get_snapshot_list( - cpfile, prev, prev_bh, kaddr); + /* Update the list entry for the previous snapshot */ + list = 
kmap_local_folio(prev_bh->b_folio, prev_list_offset); list->ssl_next = cpu_to_le64(next); - kunmap_local(kaddr); + kunmap_local(list); - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + /* Update the snapshot being changed back to a plain checkpoint */ + cp = kmap_local_folio(cp_bh->b_folio, offset); cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); nilfs_checkpoint_clear_snapshot(cp); - kunmap_local(kaddr); + kunmap_local(cp); - kaddr = kmap_local_page(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + /* Update the statistics in the header */ + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_nsnapshots, -1); - kunmap_local(kaddr); + kunmap_local(header); mark_buffer_dirty(next_bh); mark_buffer_dirty(prev_bh); @@ -964,12 +994,12 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) out_next: brelse(next_bh); - out_header: - brelse(header_bh); - out_cp: brelse(cp_bh); + out_header: + brelse(header_bh); + out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; @@ -990,7 +1020,7 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) { struct buffer_head *bh; struct nilfs_checkpoint *cp; - void *kaddr; + size_t offset; int ret; /* @@ -1004,13 +1034,14 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); if (ret < 0) goto out; - kaddr = kmap_local_page(bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, bh); + cp = kmap_local_folio(bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) ret = -ENOENT; else ret = nilfs_checkpoint_snapshot(cp); - kunmap_local(kaddr); + kunmap_local(cp); brelse(bh); out: @@ -1079,7 +1110,6 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) { struct buffer_head *bh; struct nilfs_cpfile_header *header; - void *kaddr; int ret; down_read(&NILFS_MDT(cpfile)->mi_sem); @@ -1087,12 +1117,11 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out_sem; - kaddr = kmap_local_page(bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + header = kmap_local_folio(bh->b_folio, 0); cpstat->cs_cno = nilfs_mdt_cno(cpfile); cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); - kunmap_local(kaddr); + kunmap_local(header); brelse(bh); out_sem: From 310293201ed242e466b0e9f10f53b4a4abccffec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Oct 2024 18:25:43 +0900 Subject: [PATCH 53/71] nilfs2: remove nilfs_writepage Since nilfs2 has a ->writepages operation already, ->writepage is only called by the migration code. If we add a ->migrate_folio operation, it won't even be used for that and so it can be deleted. 
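For orientation, the resulting operations table reads roughly as follows (a sketch of the end state assembled from the hunks below; elided members are unchanged):

	const struct address_space_operations nilfs_aops = {
		.read_folio		= nilfs_read_folio,
		.writepages		= nilfs_writepages,
		.dirty_folio		= nilfs_dirty_folio,
		/* ... unchanged members elided ... */
		.write_end		= nilfs_write_end,
		.invalidate_folio	= block_invalidate_folio,
		.direct_IO		= nilfs_direct_IO,
		.migrate_folio		= buffer_migrate_folio_norefs,
		.is_partially_uptodate	= block_is_partially_uptodate,
	};

Note that the _norefs migration variant is the one that works here; the bracketed fixup note below records that it was needed to avoid a panic.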
[konishi.ryusuke@gmail.com: fixed panic by using buffer_migrate_folio_norefs] Link: https://lkml.kernel.org/r/20241002150036.1339475-2-willy@infradead.org Link: https://lkml.kernel.org/r/20241024092602.13395-10-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index be6acf6e2bfc..c24f06268010 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -170,37 +170,6 @@ static int nilfs_writepages(struct address_space *mapping, return err; } -static int nilfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - struct inode *inode = folio->mapping->host; - int err; - - if (sb_rdonly(inode->i_sb)) { - /* - * It means that filesystem was remounted in read-only - * mode because of error or metadata corruption. But we - * have dirty pages that try to be flushed in background. - * So, here we simply discard this dirty page. - */ - nilfs_clear_folio_dirty(folio); - folio_unlock(folio); - return -EROFS; - } - - folio_redirty_for_writepage(wbc, folio); - folio_unlock(folio); - - if (wbc->sync_mode == WB_SYNC_ALL) { - err = nilfs_construct_segment(inode->i_sb); - if (unlikely(err)) - return err; - } else if (wbc->for_reclaim) - nilfs_flush_segment(inode->i_sb, inode->i_ino); - - return 0; -} - static bool nilfs_dirty_folio(struct address_space *mapping, struct folio *folio) { @@ -295,7 +264,6 @@ nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } const struct address_space_operations nilfs_aops = { - .writepage = nilfs_writepage, .read_folio = nilfs_read_folio, .writepages = nilfs_writepages, .dirty_folio = nilfs_dirty_folio, @@ -304,6 +272,7 @@ const struct address_space_operations nilfs_aops = { .write_end = nilfs_write_end, .invalidate_folio = block_invalidate_folio, .direct_IO = nilfs_direct_IO, + .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, }; From c1d73eb8d06003e7714cd3ce1d0d79832e59b1e9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Oct 2024 18:25:44 +0900 Subject: [PATCH 54/71] nilfs2: convert nilfs_page_count_clean_buffers() to take a folio Both callers have a folio, so pass it in and use it directly. 
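The caller-side change is mechanical (a sketch mirroring the two call sites in the diff below):

	/* Before: reach back from the folio to its head page */
	nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to);

	/* After: pass the folio straight through */
	nr_dirty = nilfs_page_count_clean_buffers(folio, from, to);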
[konishi.ryusuke@gmail.com: fixed a checkpatch warning about function declaration] Link: https://lkml.kernel.org/r/20241002150036.1339475-3-willy@infradead.org Link: https://lkml.kernel.org/r/20241024092602.13395-11-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 2 +- fs/nilfs2/inode.c | 2 +- fs/nilfs2/page.c | 4 ++-- fs/nilfs2/page.h | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index a8602729586a..14e8d82f8629 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -95,7 +95,7 @@ static void nilfs_commit_chunk(struct folio *folio, unsigned int nr_dirty; int err; - nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to); + nr_dirty = nilfs_page_count_clean_buffers(folio, from, to); copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index c24f06268010..cf9ba481ae37 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -242,7 +242,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, unsigned int nr_dirty; int err; - nr_dirty = nilfs_page_count_clean_buffers(&folio->page, start, + nr_dirty = nilfs_page_count_clean_buffers(folio, start, start + copied); copied = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 10def4b55995..e48079ebe939 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -422,14 +422,14 @@ void nilfs_clear_folio_dirty(struct folio *folio) __nilfs_clear_folio_dirty(folio); } -unsigned int nilfs_page_count_clean_buffers(struct page *page, +unsigned int nilfs_page_count_clean_buffers(struct folio *folio, unsigned int from, unsigned int to) { unsigned int block_start, block_end; struct buffer_head *bh, *head; unsigned int nc = 0; - for (bh = head = page_buffers(page), block_start = 0; + for (bh = head = folio_buffers(folio), block_start = 0; bh != head || !block_start; block_start = block_end, bh = bh->b_this_page) { block_end = block_start + bh->b_size; diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 64521a03a19e..136cd1c143c9 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -43,8 +43,8 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_folio_dirty(struct folio *folio); void nilfs_clear_dirty_pages(struct address_space *mapping); -unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int, - unsigned int); +unsigned int nilfs_page_count_clean_buffers(struct folio *folio, + unsigned int from, unsigned int to); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff); From b18d78dec38e0ccd06e968396388eadea1da1c4e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Oct 2024 18:25:45 +0900 Subject: [PATCH 55/71] nilfs2: convert nilfs_recovery_copy_block() to take a folio Use memcpy_to_folio() instead of open-coding it, and use offset_in_folio() in case anybody wants to use nilfs2 on a device with large blocks. 
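The copy helper boils down to the following before/after (a sketch of what the hunk below implements):

	/* Before: page-sized offset math plus an open-coded highmem copy */
	size_t from = pos & ~PAGE_MASK;
	void *kaddr = kmap_local_page(page);

	memcpy(kaddr + from, bh_org->b_data, bh_org->b_size);
	kunmap_local(kaddr);

	/* After: folio-aware offset; memcpy_to_folio() handles the mapping */
	size_t from = offset_in_folio(folio, pos);

	memcpy_to_folio(folio, from, bh_org->b_data, bh_org->b_size);

Because offset_in_folio() replaces the PAGE_MASK arithmetic, the result stays correct if the folio ever spans more than one page.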
[konishi.ryusuke@gmail.com: added label name change] Link: https://lkml.kernel.org/r/20241002150036.1339475-4-willy@infradead.org Link: https://lkml.kernel.org/r/20241024092602.13395-12-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/recovery.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 21d81097a89f..e43405bf521e 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -481,19 +481,16 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, - loff_t pos, struct page *page) + loff_t pos, struct folio *folio) { struct buffer_head *bh_org; - size_t from = pos & ~PAGE_MASK; - void *kaddr; + size_t from = offset_in_folio(folio, pos); bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); if (unlikely(!bh_org)) return -EIO; - kaddr = kmap_local_page(page); - memcpy(kaddr + from, bh_org->b_data, bh_org->b_size); - kunmap_local(kaddr); + memcpy_to_folio(folio, from, bh_org->b_data, bh_org->b_size); brelse(bh_org); return 0; } @@ -531,13 +528,13 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } - err = nilfs_recovery_copy_block(nilfs, rb, pos, &folio->page); + err = nilfs_recovery_copy_block(nilfs, rb, pos, folio); if (unlikely(err)) - goto failed_page; + goto failed_folio; err = nilfs_set_file_dirty(inode, 1); if (unlikely(err)) - goto failed_page; + goto failed_folio; block_write_end(NULL, inode->i_mapping, pos, blocksize, blocksize, folio, NULL); @@ -548,7 +545,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, (*nr_salvaged_blocks)++; goto next; - failed_page: + failed_folio: folio_unlock(folio); folio_put(folio); From 013a07052a1a02d6d8bebf84ad3a8cb483292da7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Oct 2024 18:25:46 +0900 Subject: [PATCH 56/71] nilfs2: convert metadata aops from writepage to writepages By implementing ->writepages instead of ->writepage, we remove a layer of indirect function calls from the writeback path and the last use of struct page in nilfs2. 
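The replacement is the now-standard writeback_iter() loop. Its generic shape is (a sketch with placeholder foo_* names, which are not real kernel symbols; nilfs2's actual version is in the hunk below):

	static int foo_writepages(struct address_space *mapping,
				  struct writeback_control *wbc)
	{
		struct folio *folio = NULL;
		int error;

		/* writeback_iter() hands back each dirty folio, locked */
		while ((folio = writeback_iter(mapping, wbc, folio, &error)))
			error = foo_write_folio(folio, wbc);

		return error;
	}

Driving the loop inside the filesystem removes one indirect call per folio compared with the old ->writepage path.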
[konishi.ryusuke@gmail.com: fixed panic by using buffer_migrate_folio_norefs] Link: https://lkml.kernel.org/r/20241002150036.1339475-5-willy@infradead.org Link: https://lkml.kernel.org/r/20241024092602.13395-13-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index a4c1e00aaaac..432181cfb0b5 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -399,10 +399,9 @@ int nilfs_mdt_fetch_dirty(struct inode *inode) return test_bit(NILFS_I_DIRTY, &ii->i_state); } -static int -nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) +static int nilfs_mdt_write_folio(struct folio *folio, + struct writeback_control *wbc) { - struct folio *folio = page_folio(page); struct inode *inode = folio->mapping->host; struct super_block *sb; int err = 0; @@ -435,11 +434,23 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) return err; } +static int nilfs_mdt_writeback(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct folio *folio = NULL; + int error; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) + error = nilfs_mdt_write_folio(folio, wbc); + + return error; +} static const struct address_space_operations def_mdt_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, - .writepage = nilfs_mdt_write_page, + .writepages = nilfs_mdt_writeback, + .migrate_folio = buffer_migrate_folio_norefs, }; static const struct inode_operations def_mdt_iops; From 2f07b652384969f5d0b317e1daa5f2eb967bc73d Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Fri, 25 Oct 2024 19:43:19 -0400 Subject: [PATCH 57/71] checkpatch: always parse orig_commit in fixes tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not require the presence of `$balanced_parens` to get the commit SHA; this allows a `Fixes: deadbeef` tag to get a correct suggestion rather than a suggestion containing a reference to HEAD. Given this patch:

: From: Tamir Duberstein
: Subject: Test patch
: Date: Fri, 25 Oct 2024 19:30:51 -0400
:
: This is a test patch.
:
: Fixes: bd17e036b495
: Signed-off-by: Tamir Duberstein
: --- /dev/null
: +++ b/new-file
: @@ -0,0 +1 @@
: +Test.

Before: WARNING: Please use correct Fixes: style 'Fixes: <12 chars of sha1> ("<title line>")' - ie: 'Fixes: c10a7d25e68f ("Test patch")'
After: WARNING: Please use correct Fixes: style 'Fixes: <12 chars of sha1> ("<title line>")' - ie: 'Fixes: bd17e036b495 ("checkpatch: warn for non-standard fixes tag style")'

The prior behavior incorrectly suggested the patch's own SHA and title line rather than the referenced commit's. This fixes that. 
Ironically, this patch itself carries the tag: Fixes: bd17e036b495 ("checkpatch: warn for non-standard fixes tag style") Signed-off-by: Tamir Duberstein <tamird@gmail.com> Cc: Andy Whitcroft <apw@canonical.com> Cc: Dwaipayan Ray <dwaipayanray1@gmail.com> Cc: Joe Perches <joe@perches.com> Cc: Louis Peens <louis.peens@corigine.com> Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com> Cc: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se> Cc: Philippe Schenker <philippe.schenker@toradex.com> Cc: Simon Horman <horms@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- scripts/checkpatch.pl | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4427572b2477..b03d526e4c45 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3209,36 +3209,31 @@ sub process { # Check Fixes: styles is correct if (!$in_header_lines && - $line =~ /^\s*fixes:?\s*(?:commit\s*)?[0-9a-f]{5,}\b/i) { - my $orig_commit = ""; - my $id = "0123456789ab"; - my $title = "commit title"; - my $tag_case = 1; - my $tag_space = 1; - my $id_length = 1; - my $id_case = 1; + $line =~ /^\s*(fixes:?)\s*(?:commit\s*)?([0-9a-f]{5,40})(?:\s*($balanced_parens))?/i) { + my $tag = $1; + my $orig_commit = $2; + my $title; my $title_has_quotes = 0; $fixes_tag = 1; - - if ($line =~ /(\s*fixes:?)\s+([0-9a-f]{5,})\s+($balanced_parens)/i) { - my $tag = $1; - $orig_commit = $2; - $title = $3; - - $tag_case = 0 if $tag eq "Fixes:"; - $tag_space = 0 if ($line =~ /^fixes:? [0-9a-f]{5,} ($balanced_parens)/i); - - $id_length = 0 if ($orig_commit =~ /^[0-9a-f]{12}$/i); - $id_case = 0 if ($orig_commit !~ /[A-F]/); - + if (defined $3) { # Always strip leading/trailing parens then double quotes if existing - $title = substr($title, 1, -1); + $title = substr($3, 1, -1); if ($title =~ /^".*"$/) { $title = substr($title, 1, -1); $title_has_quotes = 1; } + } else { + $title = "commit title" } + + my $tag_case = not ($tag eq "Fixes:"); + my $tag_space = not ($line =~ /^fixes:? [0-9a-f]{5,40} ($balanced_parens)/i); + + my $id_length = not ($orig_commit =~ /^[0-9a-f]{12}$/i); + my $id_case = not ($orig_commit !~ /[A-F]/); + + my $id = "0123456789ab"; my ($cid, $ctitle) = git_commit_info($orig_commit, $id, $title); From e01caa2b63c88c64ee90b83a92a584ec90feeda5 Mon Sep 17 00:00:00 2001 From: Sui Jingfeng <sui.jingfeng@linux.dev> Date: Tue, 29 Oct 2024 02:29:20 +0800 Subject: [PATCH 58/71] lib/scatterlist: use sg_phys() helper This shortens the code in the horizontal direction, making it easier to read. 
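For context, sg_phys() is defined in include/linux/scatterlist.h as essentially the expression being open-coded here (modulo the exact return type, which is an assumption from memory):

	static inline dma_addr_t sg_phys(struct scatterlist *sg)
	{
		return page_to_phys(sg_page(sg)) + sg->offset;
	}

so the substitution in the hunk below is behavior-preserving by construction.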
Link: https://lkml.kernel.org/r/20241028182920.1025819-1-sui.jingfeng@linux.dev Signed-off-by: Sui Jingfeng <sui.jingfeng@linux.dev> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- lib/scatterlist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 473b2646f71c..5bb6b8aff232 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -474,14 +474,14 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, return -EOPNOTSUPP; if (sgt_append->prv) { - unsigned long next_pfn = (page_to_phys(sg_page(sgt_append->prv)) + - sgt_append->prv->offset + sgt_append->prv->length) / PAGE_SIZE; + unsigned long next_pfn; if (WARN_ON(offset)) return -EINVAL; /* Merge contiguous pages into the last SG */ prv_len = sgt_append->prv->length; + next_pfn = (sg_phys(sgt_append->prv) + prv_len) / PAGE_SIZE; if (page_to_pfn(pages[0]) == next_pfn) { last_pg = pfn_to_page(next_pfn - 1); while (n_pages && pages_are_mergeable(pages[0], last_pg)) { From b5e60497a4b7f0b295c4c59b202cf4703b27062b Mon Sep 17 00:00:00 2001 From: Andrew Kreimer <algonell@gmail.com> Date: Sun, 27 Oct 2024 15:35:18 +0200 Subject: [PATCH 59/71] ocfs2: cluster: fix a typo Fix a typo: panicing -> panicking. Via codespell. Link: https://lkml.kernel.org/r/20241027133540.22090-1-algonell@gmail.com Signed-off-by: Andrew Kreimer <algonell@gmail.com> Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Jun Piao <piaojun@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/ocfs2/cluster/quorum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 15d0ed9c13e5..8bf17231d7b7 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -60,7 +60,7 @@ static void o2quo_fence_self(void) switch (o2nm_single_cluster->cl_fence_method) { case O2NM_FENCE_PANIC: panic("*** ocfs2 is very sorry to be fencing this system by " - "panicing ***\n"); + "panicking ***\n"); break; default: WARN_ON(o2nm_single_cluster->cl_fence_method >= From 77e94b0496ef39b759f23b4ece8c434a4b5c2e47 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" <linux@treblig.org> Date: Tue, 22 Oct 2024 01:25:43 +0100 Subject: [PATCH 60/71] ocfs2: remove unused errmsg function and table dlm_errmsg() has been unused since 2010's commit 0016eedc4185 ("ocfs2_dlmfs: Use the stackglue.") Remove dlm_errmsg() and the message table it indexes. Link: https://lkml.kernel.org/r/20241022002543.302606-1-linux@treblig.org Signed-off-by: Dr. 
David Alan Gilbert <linux@treblig.org> Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Jun Piao <piaojun@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/ocfs2/dlm/dlmapi.h | 2 -- fs/ocfs2/dlm/dlmdebug.c | 53 ----------------------------------------- 2 files changed, 55 deletions(-) diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h index bae60ca2672a..847a52dcbe7d 100644 --- a/fs/ocfs2/dlm/dlmapi.h +++ b/fs/ocfs2/dlm/dlmapi.h @@ -62,8 +62,6 @@ enum dlm_status { DLM_MAXSTATS, /* 41: upper limit for return code validation */ }; -/* for pretty-printing dlm_status error messages */ -const char *dlm_errmsg(enum dlm_status err); /* for pretty-printing dlm_status error names */ const char *dlm_errname(enum dlm_status err); diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index be5e9ed7da8d..e9ef4e2b0e75 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -164,59 +164,6 @@ static const char *dlm_errnames[] = { [DLM_MAXSTATS] = "DLM_MAXSTATS", }; -static const char *dlm_errmsgs[] = { - [DLM_NORMAL] = "request in progress", - [DLM_GRANTED] = "request granted", - [DLM_DENIED] = "request denied", - [DLM_DENIED_NOLOCKS] = "request denied, out of system resources", - [DLM_WORKING] = "async request in progress", - [DLM_BLOCKED] = "lock request blocked", - [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock", - [DLM_DENIED_GRACE_PERIOD] = "topological change in progress", - [DLM_SYSERR] = "system error", - [DLM_NOSUPPORT] = "unsupported", - [DLM_CANCELGRANT] = "can't cancel convert: already granted", - [DLM_IVLOCKID] = "bad lockid", - [DLM_SYNC] = "synchronous request granted", - [DLM_BADTYPE] = "bad resource type", - [DLM_BADRESOURCE] = "bad resource handle", - [DLM_MAXHANDLES] = "no more resource handles", - [DLM_NOCLINFO] = "can't contact cluster manager", - [DLM_NOLOCKMGR] = "can't contact lock manager", - [DLM_NOPURGED] = "can't contact purge daemon", - [DLM_BADARGS] = "bad api args", - [DLM_VOID] = "no status", - [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed", - [DLM_IVBUFLEN] = "invalid resource name length", - [DLM_CVTUNGRANT] = "attempted to convert ungranted lock", - [DLM_BADPARAM] = "invalid lock mode specified", - [DLM_VALNOTVALID] = "value block has been invalidated", - [DLM_REJECTED] = "request rejected, unrecognized client", - [DLM_ABORT] = "blocked lock request cancelled", - [DLM_CANCEL] = "conversion request cancelled", - [DLM_IVRESHANDLE] = "invalid resource handle", - [DLM_DEADLOCK] = "deadlock recovery refused this request", - [DLM_DENIED_NOASTS] = "failed to allocate AST", - [DLM_FORWARD] = "request must wait for primary's response", - [DLM_TIMEOUT] = "timeout value for lock has expired", - [DLM_IVGROUPID] = "invalid group specification", - [DLM_VERS_CONFLICT] = "version conflicts prevent request handling", - [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong", - [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device", - [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ", - [DLM_RECOVERING] = "lock resource being recovered", - [DLM_MIGRATING] = "lock resource being migrated", - [DLM_MAXSTATS] = "invalid error number", -}; - -const char *dlm_errmsg(enum dlm_status err) -{ - if (err >= DLM_MAXSTATS || err < 0) - return dlm_errmsgs[DLM_MAXSTATS]; - return dlm_errmsgs[err]; -} 
-EXPORT_SYMBOL_GPL(dlm_errmsg); - const char *dlm_errname(enum dlm_status err) { if (err >= DLM_MAXSTATS || err < 0) From d7ce9c73da54a096311edbf4688b78b179dd79bc Mon Sep 17 00:00:00 2001 From: Huang Ying <ying.huang@intel.com> Date: Tue, 29 Oct 2024 20:27:35 +0800 Subject: [PATCH 61/71] resource: avoid unnecessary resource tree walking in __region_intersects() Currently, if __region_intersects() finds any overlapped but unmatched resource, it walks the descendant resource tree to check for overlapped and matched descendant resources using for_each_resource(). However, in the current kernel, for_each_resource() iterates not only the descendant tree, but also subsequent sibling trees in certain scenarios. While this doesn't introduce bugs, it makes the code hard to understand and potentially inefficient. So, the patch revises next_resource() and for_each_resource() and makes for_each_resource() traverse only the subtree under the specified subtree root. Testing shows that this avoids unnecessary resource tree walking in __region_intersects(). For the example resource tree below,

    X
    |
    A----D----E
    |
    B--C

if 'A' is the overlapped but unmatched resource, the original kernel iterates 'B', 'C', 'D', and 'E' when it walks the descendant tree, while the patched kernel iterates only 'B' and 'C'. Thanks to David Hildenbrand for providing a good resource tree example. Link: https://lkml.kernel.org/r/20241029122735.79164-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" <ying.huang@intel.com> Acked-by: Dan Williams <dan.j.williams@intel.com> Cc: David Hildenbrand <david@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Jonathan Cameron <jonathan.cameron@huawei.com> Cc: Alistair Popple <apopple@nvidia.com> Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Cc: Bjorn Helgaas <bhelgaas@google.com> Cc: Baoquan He <bhe@redhat.com> Cc: Dave Jiang <dave.jiang@intel.com> Cc: Alison Schofield <alison.schofield@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- kernel/resource.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 2d4208b2f62f..59c6e608f1d1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -50,17 +50,35 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); -static struct resource *next_resource(struct resource *p, bool skip_children) +/* + * Return the next node of @p in pre-order tree traversal. If + * @skip_children is true, skip the descendant nodes of @p in + * traversal. If @p is a descendant of @subtree_root, only traverse + * the subtree under @subtree_root. + */ +static struct resource *next_resource(struct resource *p, bool skip_children, + struct resource *subtree_root) { if (!skip_children && p->child) return p->child; - while (!p->sibling && p->parent) + while (!p->sibling && p->parent) { p = p->parent; + if (p == subtree_root) + return NULL; + } return p->sibling; } +/* + * Traverse the resource subtree under @_root in pre-order, excluding + * @_root itself. + * + * NOTE: '__p' is introduced to avoid shadowing '_p' outside of loop. + * And it is referenced to avoid unused variable warning. 
+ */ #define for_each_resource(_root, _p, _skip_children) \ - for ((_p) = (_root)->child; (_p); (_p) = next_resource(_p, _skip_children)) + for (typeof(_root) __root = (_root), __p = _p = __root->child; \ + __p && _p; _p = next_resource(_p, _skip_children, __root)) #ifdef CONFIG_PROC_FS @@ -88,7 +106,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; - return (void *)next_resource(p, false); + return (void *)next_resource(p, false, NULL); } static void r_stop(struct seq_file *m, void *v) From 82e33f249f1126cf3c5f39a31b850d485ac33bc3 Mon Sep 17 00:00:00 2001 From: Mirsad Todorovac <mtodorovac69@gmail.com> Date: Tue, 29 Oct 2024 06:46:52 +0100 Subject: [PATCH 62/71] fs/proc/kcore.c: fix coccinelle reported ERROR instances Coccinelle complains about the nested reuse of the pointer `iter' with different pointer type: ./fs/proc/kcore.c:515:26-30: ERROR: invalid reference to the index variable of the iterator on line 499 ./fs/proc/kcore.c:534:23-27: ERROR: invalid reference to the index variable of the iterator on line 499 ./fs/proc/kcore.c:550:40-44: ERROR: invalid reference to the index variable of the iterator on line 499 ./fs/proc/kcore.c:568:27-31: ERROR: invalid reference to the index variable of the iterator on line 499 ./fs/proc/kcore.c:581:28-32: ERROR: invalid reference to the index variable of the iterator on line 499 ./fs/proc/kcore.c:599:27-31: ERROR: invalid reference to the index variable of the iterator on line 499 ./fs/proc/kcore.c:607:38-42: ERROR: invalid reference to the index variable of the iterator on line 499 ./fs/proc/kcore.c:614:26-30: ERROR: invalid reference to the index variable of the iterator on line 499 Replacing `struct kcore_list *iter' with `struct kcore_list *tmp' doesn't change the scope and the functionality is the same and coccinelle seems happy. NOTE: There was an issue with using `struct kcore_list *pos' as the nested iterator. The build did not work! [akpm@linux-foundation.org: s/tmp/pos/] Link: https://lkml.kernel.org/r/20241029054651.86356-2-mtodorovac69@gmail.com Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20220331223700.902556-1-jakobkoschel@gmail.com Fixes: 04d168c6d42d ("fs/proc/kcore.c: remove check of list iterator against head past the loop body") Signed-off-by: Jakob Koschel <jakobkoschel@gmail.com> Signed-off-by: Mirsad Todorovac <mtodorovac69@gmail.com> Cc: Mike Rapoport <rppt@kernel.org> Cc: David Hildenbrand <david@redhat.com> Cc: Oscar Salvador <osalvador@suse.de> Cc: "Brian Johannesmeyer" <bjohannesmeyer@gmail.com> Cc: Cristiano Giuffrida <c.giuffrida@vu.nl> Cc: "Bos, H.J." <h.j.bos@vu.nl> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Yang Li <yang.lee@linux.alibaba.com> Cc: Baoquan He <bhe@redhat.com> Cc: Hari Bathini <hbathini@linux.ibm.com> Cc: Yan Zhen <yanzhen@vivo.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/proc/kcore.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 51446c59388f..7a85735d584f 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -493,13 +493,13 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) * the previous entry, search for a matching entry. 
 	 */
 	if (!m || start < m->addr || start >= m->addr + m->size) {
-		struct kcore_list *iter;
+		struct kcore_list *pos;
 
 		m = NULL;
-		list_for_each_entry(iter, &kclist_head, list) {
-			if (start >= iter->addr &&
-			    start < iter->addr + iter->size) {
-				m = iter;
+		list_for_each_entry(pos, &kclist_head, list) {
+			if (start >= pos->addr &&
+			    start < pos->addr + pos->size) {
+				m = pos;
 				break;
 			}
 		}

From 777620b890d783c6575f172041f390c4c075b666 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 7 Oct 2024 10:37:52 +0200
Subject: [PATCH 63/71] dma-buf: use atomic64_inc_return() in dma_buf_getfile()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use atomic64_inc_return(&ref) instead of atomic64_add_return(1, &ref) to
use the optimized implementation and to ease register pressure around
the primitive on targets that implement an optimized variant.

Link: https://lkml.kernel.org/r/20241007083921.47525-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: "Christian König" <christian.koenig@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/dma-buf/dma-buf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 8892bc701a66..a3649db76add 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -558,7 +558,7 @@ static struct file *dma_buf_getfile(size_t size, int flags)
 	 * Override ->i_ino with the unique and dmabuffs specific
 	 * value.
 	 */
-	inode->i_ino = atomic64_add_return(1, &dmabuf_inode);
+	inode->i_ino = atomic64_inc_return(&dmabuf_inode);
 	flags &= O_ACCMODE | O_NONBLOCK;
 	file = alloc_file_pseudo(inode, dma_buf_mnt, "dmabuf",
 				 flags, &dma_buf_fops);

From 03ecb24db20e78c478b9b7c0ec767bfdc053ecd4 Mon Sep 17 00:00:00 2001
From: Lance Yang <ioworker0@gmail.com>
Date: Sun, 27 Oct 2024 20:07:46 +0800
Subject: [PATCH 64/71] hung_task: add detect count for hung tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "add detect count for hung tasks", v2.

This patchset adds a counter, hung_task_detect_count, to track the number
of times hung tasks are detected.

IMHO, hung tasks are a critical metric.  Currently, we detect them by
periodically parsing dmesg.  However, this method isn't as user-friendly
as using a counter.

Sometimes, a short-lived issue with a NIC or hard drive can quickly
decrease hung_task_warnings to zero.  Without warnings, we must directly
access the node to ensure that there are no more hung tasks and that the
system has recovered.  After all, load average alone cannot provide a
clear picture.

Once this counter is in place, in a high-density deployment pattern, we
plan to set hung_task_timeout_secs to a lower number to improve
stability, even though this might result in false positives.  We can then
set a time-based threshold: if hung tasks last beyond this duration, we
will automatically migrate containers to other nodes.  Based on past
experience, this approach could help avoid many production disruptions.

Moreover, just like other important events such as OOM that already have
counters, having a dedicated counter for hung tasks makes sense ;)

This patch (of 2):

This commit adds a counter, hung_task_detect_count, to track the number
of times hung tasks are detected.

IMHO, hung tasks are a critical metric.  Currently, we detect them by
periodically parsing dmesg.  However, this method isn't as user-friendly
as using a counter.

Sometimes, a short-lived issue with a NIC or hard drive can quickly
decrease hung_task_warnings to zero.  Without warnings, we must directly
access the node to ensure that there are no more hung tasks and that the
system has recovered.  After all, load average alone cannot provide a
clear picture.

Once this counter is in place, in a high-density deployment pattern, we
plan to set hung_task_timeout_secs to a lower number to improve
stability, even though this might result in false positives.  We can then
set a time-based threshold: if hung tasks last beyond this duration, we
will automatically migrate containers to other nodes.  Based on past
experience, this approach could help avoid many production disruptions.

Moreover, just like other important events such as OOM that already have
counters, having a dedicated counter for hung tasks makes sense.

[ioworker0@gmail.com: proc_doulongvec_minmax instead of proc_dointvec]
Link: https://lkml.kernel.org/r/20241101114833.8377-1-ioworker0@gmail.com
Link: https://lkml.kernel.org/r/20241027120747.42833-1-ioworker0@gmail.com
Link: https://lkml.kernel.org/r/20241027120747.42833-2-ioworker0@gmail.com
Signed-off-by: Mingzhe Yang <mingzhe.yang@ly.com>
Signed-off-by: Lance Yang <ioworker0@gmail.com>
Cc: Bang Li <libang.li@antgroup.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Cun <cunhuang@tencent.com>
Cc: Joel Granados <j.granados@samsung.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: John Siddle <jsiddle@redhat.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Yongliang Gao <leonylgao@tencent.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/hung_task.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 959d99583d1c..c18717189f32 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -30,6 +30,11 @@
  */
 static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
 
+/*
+ * Total number of tasks detected as hung since boot:
+ */
+static unsigned long __read_mostly sysctl_hung_task_detect_count;
+
 /*
  * Limit number of tasks checked in a batch.
  *
@@ -115,6 +120,12 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 	if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
 		return;
 
+	/*
+	 * This counter tracks the total number of tasks detected as hung
+	 * since boot.
+	 */
+	sysctl_hung_task_detect_count++;
+
 	trace_sched_process_hang(t);
 
 	if (sysctl_hung_task_panic) {
@@ -314,6 +325,13 @@ static struct ctl_table hung_task_sysctls[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_NEG_ONE,
 	},
+	{
+		.procname	= "hung_task_detect_count",
+		.data		= &sysctl_hung_task_detect_count,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0444,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
 };
 
 static void __init hung_task_sysctl_init(void)

From 62bf7065cc6056a51a240c810b95d887e5bb7c8c Mon Sep 17 00:00:00 2001
From: Lance Yang <ioworker0@gmail.com>
Date: Sun, 27 Oct 2024 20:07:47 +0800
Subject: [PATCH 65/71] hung_task: add docs for hung_task_detect_count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit introduces documentation for hung_task_detect_count in
kernel.rst.
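As an aside, here is a minimal user-space sketch (not part of this
series; the program and its output format are invented for illustration)
showing how the documented counter can be consumed, with the sysctl path
following from the table added in the previous patch:

  #include <stdio.h>

  int main(void)
  {
          unsigned long count;
          FILE *f = fopen("/proc/sys/kernel/hung_task_detect_count", "r");

          if (!f)
                  return 1;       /* likely CONFIG_DETECT_HUNG_TASK=n */
          if (fscanf(f, "%lu", &count) == 1)
                  printf("hung tasks detected since boot: %lu\n", count);
          fclose(f);
          return 0;
  }

A monitoring agent could sample this value periodically and alert on
deltas, rather than scraping dmesg as described above.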
Link: https://lkml.kernel.org/r/20241027120747.42833-3-ioworker0@gmail.com
Signed-off-by: Mingzhe Yang <mingzhe.yang@ly.com>
Signed-off-by: Lance Yang <ioworker0@gmail.com>
Cc: Bang Li <libang.li@antgroup.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Cun <cunhuang@tencent.com>
Cc: Joel Granados <j.granados@samsung.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: John Siddle <jsiddle@redhat.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Yongliang Gao <leonylgao@tencent.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/sysctl/kernel.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index f8bc1630eba0..b2b36d0c3094 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -401,6 +401,15 @@ The upper bound on the number of tasks that are checked.
 This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled.
 
 
+hung_task_detect_count
+======================
+
+Indicates the total number of tasks that have been detected as hung since
+the system boot.
+
+This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled.
+
+
 hung_task_timeout_secs
 ======================

From adc77b19f62d7e80f98400b2fca9d700d2afdd6f Mon Sep 17 00:00:00 2001
From: Dmitry Antipov <dmantipov@yandex.ru>
Date: Tue, 29 Oct 2024 12:17:36 +0300
Subject: [PATCH 66/71] ocfs2: fix uninitialized value in
 ocfs2_file_read_iter()

Syzbot has reported the following KMSAN splat:

BUG: KMSAN: uninit-value in ocfs2_file_read_iter+0x9a4/0xf80
 ocfs2_file_read_iter+0x9a4/0xf80
 __io_read+0x8d4/0x20f0
 io_read+0x3e/0xf0
 io_issue_sqe+0x42b/0x22c0
 io_wq_submit_work+0xaf9/0xdc0
 io_worker_handle_work+0xd13/0x2110
 io_wq_worker+0x447/0x1410
 ret_from_fork+0x6f/0x90
 ret_from_fork_asm+0x1a/0x30

Uninit was created at:
 __alloc_pages_noprof+0x9a7/0xe00
 alloc_pages_mpol_noprof+0x299/0x990
 alloc_pages_noprof+0x1bf/0x1e0
 allocate_slab+0x33a/0x1250
 ___slab_alloc+0x12ef/0x35e0
 kmem_cache_alloc_bulk_noprof+0x486/0x1330
 __io_alloc_req_refill+0x84/0x560
 io_submit_sqes+0x172f/0x2f30
 __se_sys_io_uring_enter+0x406/0x41c0
 __x64_sys_io_uring_enter+0x11f/0x1a0
 x64_sys_call+0x2b54/0x3ba0
 do_syscall_64+0xcd/0x1e0
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Since an instance of 'struct kiocb' may be passed from the block layer
with its 'private' field uninitialized, introduce
'ocfs2_iocb_init_rw_locked()' and use it wherever 'ocfs2_dio_end_io()'
may later examine that field, i.e. in 'ocfs2_file_read_iter()' and
'ocfs2_file_write_iter()'.
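For illustration only, a stand-alone sketch of the bug class and of the
pattern the fix enforces; every name here is invented for the example
(the real code uses the ocfs2_iocb_* macros shown in the diff below).
The point is that lock-state bits read out of a never-initialized
'private' field are garbage, which is exactly what KMSAN flags, so the
field must be set before any lock bookkeeping:

  #include <stdio.h>
  #include <stdlib.h>

  struct kiocb { void *private; };        /* stand-in for the kernel struct */

  #define IOCB_RW_LOCK 0                  /* invented bit index */

  static int iocb_rw_locked(const struct kiocb *iocb)
  {
          /* mimics testing a lock bit stored in ->private */
          return ((unsigned long)iocb->private >> IOCB_RW_LOCK) & 1;
  }

  int main(void)
  {
          struct kiocb *iocb = malloc(sizeof(*iocb));  /* 'private' starts uninitialized */

          if (!iocb)
                  return 1;
          iocb->private = NULL;   /* initialize before use, as the fix does */
          printf("rw locked: %d\n", iocb_rw_locked(iocb));
          free(iocb);
          return 0;
  }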
Link: https://lkml.kernel.org/r/20241029091736.1501946-1-dmantipov@yandex.ru Fixes: 7cdfc3a1c397 ("ocfs2: Remember rw lock level during direct io") Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru> Reported-by: syzbot+a73e253cca4f0230a5a5@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a73e253cca4f0230a5a5 Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Joseph Qi <jiangqi903@gmail.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Jun Piao <piaojun@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/ocfs2/aops.h | 2 ++ fs/ocfs2/file.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 45db1781ea73..1d1b4b7edba0 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -70,6 +70,8 @@ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_NUM_LOCKS }; +#define ocfs2_iocb_init_rw_locked(iocb) \ + (iocb->private = NULL) #define ocfs2_iocb_clear_rw_locked(iocb) \ clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 06af21982c16..cb09330a0861 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2398,6 +2398,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, } else inode_lock(inode); + ocfs2_iocb_init_rw_locked(iocb); + /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". @@ -2544,6 +2546,8 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, if (!direct_io && nowait) return -EOPNOTSUPP; + ocfs2_iocb_init_rw_locked(iocb); + /* * buffered reads protect themselves in ->read_folio(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. From a7306f3c283bfe03611229bb6280987aae2af8f9 Mon Sep 17 00:00:00 2001 From: Nataniel Farzan <natanielfarzan@gmail.com> Date: Mon, 4 Nov 2024 19:22:32 -0800 Subject: [PATCH 67/71] Improve consistency of '#error' directive messages Remove the use of contractions and use proper punctuation in #error directive messages that discourage the direct inclusion of header files. 
Link: https://lkml.kernel.org/r/20241105032231.28833-1-natanielfarzan@gmail.com Signed-off-by: Nataniel Farzan <natanielfarzan@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- arch/alpha/include/asm/spinlock_types.h | 2 +- arch/arm/include/asm/spinlock_types.h | 2 +- arch/arm64/include/asm/spinlock_types.h | 2 +- arch/hexagon/include/asm/spinlock_types.h | 2 +- arch/powerpc/include/asm/simple_spinlock_types.h | 2 +- arch/powerpc/include/asm/spinlock_types.h | 2 +- arch/s390/include/asm/spinlock_types.h | 2 +- arch/sh/include/asm/spinlock_types.h | 2 +- arch/xtensa/include/asm/spinlock_types.h | 2 +- include/acpi/platform/aclinux.h | 2 +- include/linux/compiler-clang.h | 2 +- include/linux/compiler-gcc.h | 2 +- include/linux/pm_wakeup.h | 2 +- include/linux/rwlock.h | 2 +- include/linux/rwlock_api_smp.h | 2 +- include/linux/spinlock_api_smp.h | 2 +- include/linux/spinlock_types_up.h | 2 +- include/linux/spinlock_up.h | 2 +- tools/include/linux/compiler-gcc.h | 2 +- 19 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h index 2526fd3be5fd..05a444d77c53 100644 --- a/arch/alpha/include/asm/spinlock_types.h +++ b/arch/alpha/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define _ALPHA_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif typedef struct { diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h index 0c14b36ef101..5404a2a96bf3 100644 --- a/arch/arm/include/asm/spinlock_types.h +++ b/arch/arm/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define __ASM_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif #define TICKET_SHIFT 16 diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h index 11ab1c077697..7cde3d8bd0ad 100644 --- a/arch/arm64/include/asm/spinlock_types.h +++ b/arch/arm64/include/asm/spinlock_types.h @@ -6,7 +6,7 @@ #define __ASM_SPINLOCK_TYPES_H #if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif #include <asm-generic/qspinlock_types.h> diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h index d5f66495b670..63add2d863e8 100644 --- a/arch/hexagon/include/asm/spinlock_types.h +++ b/arch/hexagon/include/asm/spinlock_types.h @@ -9,7 +9,7 @@ #define _ASM_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif typedef struct { diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h index 08243338069d..391fc19f7272 100644 --- a/arch/powerpc/include/asm/simple_spinlock_types.h +++ b/arch/powerpc/include/asm/simple_spinlock_types.h @@ -3,7 +3,7 @@ #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." 
#endif typedef struct { diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index 40b01446cf75..569765fa16bc 100644 --- a/arch/powerpc/include/asm/spinlock_types.h +++ b/arch/powerpc/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define _ASM_POWERPC_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif #ifdef CONFIG_PPC_QUEUED_SPINLOCKS diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h index b69695e39957..3653ff57d6d9 100644 --- a/arch/s390/include/asm/spinlock_types.h +++ b/arch/s390/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define __ASM_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif typedef struct { diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h index 907bda4b1619..7cb50e68448f 100644 --- a/arch/sh/include/asm/spinlock_types.h +++ b/arch/sh/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define __ASM_SH_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif typedef struct { diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h index 797aed7df3dd..6baaeac6248b 100644 --- a/arch/xtensa/include/asm/spinlock_types.h +++ b/arch/xtensa/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define __ASM_SPINLOCK_TYPES_H #if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif #include <asm-generic/qspinlock_types.h> diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index 565341c826e3..f3249b7df5cb 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -15,7 +15,7 @@ /* ACPICA external files should not include ACPICA headers directly. */ #if !defined(BUILDING_ACPICA) && !defined(_LINUX_ACPI_H) -#error "Please don't include <acpi/acpi.h> directly, include <linux/acpi.h> instead." +#error "Please do not include <acpi/acpi.h> directly, include <linux/acpi.h> instead." #endif #endif diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 4c1a39dcb624..2e7c2c282f3a 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COMPILER_TYPES_H -#error "Please don't include <linux/compiler-clang.h> directly, include <linux/compiler.h> instead." +#error "Please do not include <linux/compiler-clang.h> directly, include <linux/compiler.h> instead." #endif /* Compiler specific definitions for Clang compiler */ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index cd6f9aae311f..d0ed9583743f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COMPILER_TYPES_H -#error "Please don't include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead." +#error "Please do not include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead." 
 #endif
 
 /*
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 76cd1f9f1365..222f7530806c 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -10,7 +10,7 @@
 #define _LINUX_PM_WAKEUP_H
 
 #ifndef _DEVICE_H_
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 #include <linux/types.h>
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index c0ef596f340b..5b87c6f4a243 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -2,7 +2,7 @@
 #define __LINUX_RWLOCK_H
 
 #ifndef __LINUX_INSIDE_SPINLOCK_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 /*
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index dceb0a59b692..31d3d1116323 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -2,7 +2,7 @@
 #define __LINUX_RWLOCK_API_SMP_H
 
 #ifndef __LINUX_SPINLOCK_API_SMP_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 /*
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 89eb6f4c659c..9ecb0ab504e3 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -2,7 +2,7 @@
 #define __LINUX_SPINLOCK_API_SMP_H
 
 #ifndef __LINUX_INSIDE_SPINLOCK_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 /*
diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h
index 7f86a2016ac5..fc4e2d017c20 100644
--- a/include/linux/spinlock_types_up.h
+++ b/include/linux/spinlock_types_up.h
@@ -2,7 +2,7 @@
 #define __LINUX_SPINLOCK_TYPES_UP_H
 
 #ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 /*
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index c87204247592..1e84e71ca495 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -2,7 +2,7 @@
 #define __LINUX_SPINLOCK_UP_H
 
 #ifndef __LINUX_INSIDE_SPINLOCK_H
-# error "please don't include this file directly"
+# error "Please do not include this file directly."
 #endif
 
 #include <asm/processor.h>	/* for cpu_relax() */
diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h
index 62e7c901ac28..e20f98e14e81 100644
--- a/tools/include/linux/compiler-gcc.h
+++ b/tools/include/linux/compiler-gcc.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _TOOLS_LINUX_COMPILER_H_
-#error "Please don't include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead."
+#error "Please do not include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead."
 #endif
 
 /*

From bc73b4186736341ab5cd2c199da82db6e1134e13 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@baylibre.com>
Date: Tue, 5 Nov 2024 16:54:05 +0200
Subject: [PATCH 68/71] util_macros.h: fix/rework find_closest() macros

A bug was found in find_closest() (testing showed that
find_closest_descending() is also affected), where for certain values
with small progressions the rounding (done by averaging 2 values) causes
an incorrect index to be returned.  The rounding issues occur for
progressions of 1, 2 and 3; they go away when the progression/interval
between two values is 4 or larger.  It is particularly bad for
progressions of 1.
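The failure is easy to reproduce outside the kernel.  Below is a minimal
user-space reduction of the old macro's rounding logic, using the
'a = { 1, 2, 3 }' case discussed right after this sketch;
div_round_closest() stands in for the kernel's DIV_ROUND_CLOSEST
(positive operands only), and everything else is scaffolding invented
for the example:

  #include <stdio.h>

  static int div_round_closest(int x, int d)
  {
          return (x + d / 2) / d;
  }

  /* reduction of the old '__find_closest(x, a, as, <=)' loop */
  static int old_find_closest(int x, const int *a, int as)
  {
          int i;

          for (i = 0; i < as - 1; i++)
                  if (x <= div_round_closest(a[i] + a[i + 1], 2))
                          break;
          return i;
  }

  int main(void)
  {
          static const int a[] = { 1, 2, 3 };

          /* Averaging 1 and 2 rounds 1.5 up to 2, so x == 2 already
           * matches at index 0 and never reaches the exact element. */
          printf("%d\n", old_find_closest(2, a, 3));   /* prints 0, not 1 */
          return 0;
  }

The midpoint being rounded up is why an exact match in a progression-1
array resolves to the element on its left.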
For example, if there's an array 'a = { 1, 2, 3 }', using
'find_closest(2, a ...)' would return 0 (the index of '1') rather than 1
(the index of '2').  This means that for exact values (with a progression
of 1), find_closest() will misbehave and return the index of the value
smaller than the one we're searching for.

For progressions of 2 and 3, the exact values are obtained correctly, but
other values aren't approximated as one would expect.  Starting with
progressions of 4, all seems to be good (one gets what one would expect).

While one could argue that 'find_closest()' should not be used for arrays
with progressions of 1 (i.e. '{1, 2, 3, ...}'), the macro should still
behave correctly.

The bug was found while testing 'drivers/iio/adc/ad7606.c', specifically
the oversampling feature.
For reference, the oversampling values are listed as:

	static const unsigned int ad7606_oversampling_avail[7] = {
		1, 2, 4, 8, 16, 32, 64,
	};

When doing:

1. $ echo 1 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
   $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
   1	# this is fine

2. $ echo 2 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
   $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
   1	# this is wrong; 2 should be returned here

3. $ echo 3 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
   $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
   2	# this is fine

4. $ echo 4 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
   $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
   4	# this is fine

And from here on, the values are correct (one gets what one would
expect).

While writing a kunit test for this bug, a peculiar issue was found for
the array in the 'drivers/hwmon/ina2xx.c' & 'drivers/iio/adc/ina2xx-adc.c'
drivers.  While running the kunit test (for 'ina226_avg_tab' from these
drivers):

* idx = find_closest([-1 to 2], ina226_avg_tab, ARRAY_SIZE(ina226_avg_tab));
  This returns idx == 0, so value 1.

* idx = find_closest(3, ina226_avg_tab, ARRAY_SIZE(ina226_avg_tab));
  This returns idx == 0, value 1; and now one could argue whether 3 is
  closer to 4 or to 1.  This quirk only appears for value '3' in this
  array, but it seems to be another rounding issue.

* And from 4 onwards 'find_closest()' works fine (one gets what one
  would expect).

This change reworks the find_closest() macros to also check the distance
between 'x' and the left and right elements once 'x' falls at or below
the midpoint.  If the distance to the right element is smaller than the
distance to the left one, the index is incremented by 1.  This also
removes the need for the DIV_ROUND_CLOSEST() macro.

In order to accommodate any mix of negative and positive values, the
internal variables '__fc_x', '__fc_mid_x', '__fc_left' & '__fc_right' are
forced to 'long' type.  This also addresses any potential bugs/issues
with 'x' being of an unsigned type.  In those situations any comparison
between signed & unsigned would be promoted to a comparison between 2
unsigned numbers; this is especially annoying when '__fc_left' &
'__fc_right' underflow.

The find_closest_descending() macro was also reworked: it is duplicated
from find_closest() and iterates the array in reverse.  The main reason
for this is to get the same indices as 'find_closest()' (only reversed).
The comparison for '__fc_right < __fc_left' favors walking the array in
ascending order.
For example for array '{ 1024, 512, 256, 128, 64, 16, 4, 1 }' and x = 3, we get: __fc_mid_x = 2 __fc_left = -1 __fc_right = -2 Then '__fc_right < __fc_left' evaluates to true and '__fc_i++' becomes 7 which is not quite incorrect, but 3 is closer to 4 than to 1. This change has been validated with the kunit from the next patch. Link: https://lkml.kernel.org/r/20241105145406.554365-1-aardelean@baylibre.com Fixes: 95d119528b0b ("util_macros.h: add find_closest() macro") Signed-off-by: Alexandru Ardelean <aardelean@baylibre.com> Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- include/linux/util_macros.h | 56 ++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 6bb460c3e818..825487fb66fa 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -4,19 +4,6 @@ #include <linux/math.h> -#define __find_closest(x, a, as, op) \ -({ \ - typeof(as) __fc_i, __fc_as = (as) - 1; \ - typeof(x) __fc_x = (x); \ - typeof(*a) const *__fc_a = (a); \ - for (__fc_i = 0; __fc_i < __fc_as; __fc_i++) { \ - if (__fc_x op DIV_ROUND_CLOSEST(__fc_a[__fc_i] + \ - __fc_a[__fc_i + 1], 2)) \ - break; \ - } \ - (__fc_i); \ -}) - /** * find_closest - locate the closest element in a sorted array * @x: The reference value. @@ -25,8 +12,27 @@ * @as: Size of 'a'. * * Returns the index of the element closest to 'x'. + * Note: If using an array of negative numbers (or mixed positive numbers), + * then be sure that 'x' is of a signed-type to get good results. */ -#define find_closest(x, a, as) __find_closest(x, a, as, <=) +#define find_closest(x, a, as) \ +({ \ + typeof(as) __fc_i, __fc_as = (as) - 1; \ + long __fc_mid_x, __fc_x = (x); \ + long __fc_left, __fc_right; \ + typeof(*a) const *__fc_a = (a); \ + for (__fc_i = 0; __fc_i < __fc_as; __fc_i++) { \ + __fc_mid_x = (__fc_a[__fc_i] + __fc_a[__fc_i + 1]) / 2; \ + if (__fc_x <= __fc_mid_x) { \ + __fc_left = __fc_x - __fc_a[__fc_i]; \ + __fc_right = __fc_a[__fc_i + 1] - __fc_x; \ + if (__fc_right < __fc_left) \ + __fc_i++; \ + break; \ + } \ + } \ + (__fc_i); \ +}) /** * find_closest_descending - locate the closest element in a sorted array @@ -36,9 +42,27 @@ * @as: Size of 'a'. * * Similar to find_closest() but 'a' is expected to be sorted in descending - * order. + * order. The iteration is done in reverse order, so that the comparison + * of '__fc_right' & '__fc_left' also works for unsigned numbers. */ -#define find_closest_descending(x, a, as) __find_closest(x, a, as, >=) +#define find_closest_descending(x, a, as) \ +({ \ + typeof(as) __fc_i, __fc_as = (as) - 1; \ + long __fc_mid_x, __fc_x = (x); \ + long __fc_left, __fc_right; \ + typeof(*a) const *__fc_a = (a); \ + for (__fc_i = __fc_as; __fc_i >= 1; __fc_i--) { \ + __fc_mid_x = (__fc_a[__fc_i] + __fc_a[__fc_i - 1]) / 2; \ + if (__fc_x <= __fc_mid_x) { \ + __fc_left = __fc_x - __fc_a[__fc_i]; \ + __fc_right = __fc_a[__fc_i - 1] - __fc_x; \ + if (__fc_right < __fc_left) \ + __fc_i--; \ + break; \ + } \ + } \ + (__fc_i); \ +}) /** * is_insidevar - check if the @ptr points inside the @var memory range. 
From 111314157f7891da7a51a8f95df42eeb22f4268a Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@baylibre.com>
Date: Tue, 5 Nov 2024 16:54:06 +0200
Subject: [PATCH 69/71] lib: util_macros_kunit: add kunit test for
 util_macros.h

A bug was found in find_closest() (testing showed that
find_closest_descending() is also affected), where for certain values
with small progressions of 1, 2 & 3, the rounding (done by averaging 2
values) causes an incorrect index to be returned.  The bug is described
in more detail in the commit which fixes it.

This commit adds a kunit test to validate that the fix works correctly.

This kunit test adds some of the arrays (from the driver space) that seem
to produce issues with the 'find_closest()' macro.  Specifically the one
from the ad7606 driver (with which the bug was found) and from the ina2xx
drivers, which show the quirk in 'find_closest()' with elements in an
array that have an interval of 3.

For the find_closest_descending() tests, the same arrays are used as for
find_closest(), but in reverse; the idea is that
'find_closest_descending()' should return the same indices as
'find_closest()', only reversed.

For testing both macros, 4 special arrays are created, one for testing
find_closest{_descending}() for arrays of progressions 1, 2, 3 and 4.
The idea is to show that (for progressions of 1, 2 & 3) the fix works as
expected.  When the fix is removed, the issues should start to show up.
Then an extra array of negative and positive values is added.  There are
currently no such arrays within drivers, but one could expect that these
macros behave correctly even for such arrays.

To run this kunit:
 ./tools/testing/kunit/kunit.py run "*util_macros*"

Link: https://lkml.kernel.org/r/20241105145406.554365-2-aardelean@baylibre.com
Signed-off-by: Alexandru Ardelean <aardelean@baylibre.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/Kconfig.debug       |  17 +++
 lib/Makefile            |   1 +
 lib/util_macros_kunit.c | 240 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 258 insertions(+)
 create mode 100644 lib/util_macros_kunit.c

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2549b64b2280..d7dd38f14333 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2630,6 +2630,23 @@ config CHECKSUM_KUNIT
 
 	  If unsure, say N.
 
+config UTIL_MACROS_KUNIT
+	tristate "KUnit test util_macros.h functions at runtime" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  Enable this option to test the util_macros.h function at boot.
+
+	  KUnit tests run during boot and output the results to the debug log
+	  in TAP format (http://testanything.org/). Only useful for kernel devs
+	  running the KUnit test harness, and not intended for inclusion into a
+	  production build.
+
+	  For more information on KUnit and unit tests in general please refer
+	  to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+	  If unsure, say N.
+ config HASH_KUNIT_TEST tristate "KUnit Test for integer hash functions" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index 1eb89962daef..cc26f81722a5 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -372,6 +372,7 @@ obj-$(CONFIG_PLDMFW) += pldmfw/ CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o +obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o obj-$(CONFIG_HASHTABLE_KUNIT_TEST) += hashtable_test.o obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o diff --git a/lib/util_macros_kunit.c b/lib/util_macros_kunit.c new file mode 100644 index 000000000000..94cc9f0de50a --- /dev/null +++ b/lib/util_macros_kunit.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Test cases for bitfield helpers. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <kunit/test.h> +#include <linux/util_macros.h> + +#define FIND_CLOSEST_RANGE_CHECK(from, to, array, exp_idx) \ +{ \ + int i; \ + for (i = from; i <= to; i++) { \ + int found = find_closest(i, array, ARRAY_SIZE(array)); \ + KUNIT_ASSERT_EQ(ctx, exp_idx, found); \ + } \ +} + +static void test_find_closest(struct kunit *ctx) +{ + /* This will test a few arrays that are found in drivers */ + static const int ina226_avg_tab[] = { 1, 4, 16, 64, 128, 256, 512, 1024 }; + static const unsigned int ad7616_oversampling_avail[] = { + 1, 2, 4, 8, 16, 32, 64, 128, + }; + static u32 wd_timeout_table[] = { 2, 4, 6, 8, 16, 32, 48, 64 }; + static int array_prog1a[] = { 1, 2, 3, 4, 5 }; + static u32 array_prog1b[] = { 2, 3, 4, 5, 6 }; + static int array_prog1mix[] = { -2, -1, 0, 1, 2 }; + static int array_prog2a[] = { 1, 3, 5, 7 }; + static u32 array_prog2b[] = { 2, 4, 6, 8 }; + static int array_prog3a[] = { 1, 4, 7, 10 }; + static u32 array_prog3b[] = { 2, 5, 8, 11 }; + static int array_prog4a[] = { 1, 5, 9, 13 }; + static u32 array_prog4b[] = { 2, 6, 10, 14 }; + + FIND_CLOSEST_RANGE_CHECK(-3, 2, ina226_avg_tab, 0); + FIND_CLOSEST_RANGE_CHECK(3, 10, ina226_avg_tab, 1); + FIND_CLOSEST_RANGE_CHECK(11, 40, ina226_avg_tab, 2); + FIND_CLOSEST_RANGE_CHECK(41, 96, ina226_avg_tab, 3); + FIND_CLOSEST_RANGE_CHECK(97, 192, ina226_avg_tab, 4); + FIND_CLOSEST_RANGE_CHECK(193, 384, ina226_avg_tab, 5); + FIND_CLOSEST_RANGE_CHECK(385, 768, ina226_avg_tab, 6); + FIND_CLOSEST_RANGE_CHECK(769, 2048, ina226_avg_tab, 7); + + /* The array that found the bug that caused this kunit to exist */ + FIND_CLOSEST_RANGE_CHECK(-3, 1, ad7616_oversampling_avail, 0); + FIND_CLOSEST_RANGE_CHECK(2, 3, ad7616_oversampling_avail, 1); + FIND_CLOSEST_RANGE_CHECK(4, 6, ad7616_oversampling_avail, 2); + FIND_CLOSEST_RANGE_CHECK(7, 12, ad7616_oversampling_avail, 3); + FIND_CLOSEST_RANGE_CHECK(13, 24, ad7616_oversampling_avail, 4); + FIND_CLOSEST_RANGE_CHECK(25, 48, ad7616_oversampling_avail, 5); + FIND_CLOSEST_RANGE_CHECK(49, 96, ad7616_oversampling_avail, 6); + FIND_CLOSEST_RANGE_CHECK(97, 256, ad7616_oversampling_avail, 7); + + FIND_CLOSEST_RANGE_CHECK(-3, 3, wd_timeout_table, 0); + FIND_CLOSEST_RANGE_CHECK(4, 5, wd_timeout_table, 1); + FIND_CLOSEST_RANGE_CHECK(6, 7, wd_timeout_table, 2); + FIND_CLOSEST_RANGE_CHECK(8, 12, wd_timeout_table, 3); + FIND_CLOSEST_RANGE_CHECK(13, 24, wd_timeout_table, 4); + FIND_CLOSEST_RANGE_CHECK(25, 40, wd_timeout_table, 5); + FIND_CLOSEST_RANGE_CHECK(41, 56, wd_timeout_table, 6); + FIND_CLOSEST_RANGE_CHECK(57, 128, wd_timeout_table, 7); + + /* One could argue that 
find_closest() should not be used for monotonic + * arrays (like 1,2,3,4,5), but even so, it should work as long as the + * array is sorted ascending. */ + FIND_CLOSEST_RANGE_CHECK(-3, 1, array_prog1a, 0); + FIND_CLOSEST_RANGE_CHECK(2, 2, array_prog1a, 1); + FIND_CLOSEST_RANGE_CHECK(3, 3, array_prog1a, 2); + FIND_CLOSEST_RANGE_CHECK(4, 4, array_prog1a, 3); + FIND_CLOSEST_RANGE_CHECK(5, 8, array_prog1a, 4); + + FIND_CLOSEST_RANGE_CHECK(-3, 2, array_prog1b, 0); + FIND_CLOSEST_RANGE_CHECK(3, 3, array_prog1b, 1); + FIND_CLOSEST_RANGE_CHECK(4, 4, array_prog1b, 2); + FIND_CLOSEST_RANGE_CHECK(5, 5, array_prog1b, 3); + FIND_CLOSEST_RANGE_CHECK(6, 8, array_prog1b, 4); + + FIND_CLOSEST_RANGE_CHECK(-4, -2, array_prog1mix, 0); + FIND_CLOSEST_RANGE_CHECK(-1, -1, array_prog1mix, 1); + FIND_CLOSEST_RANGE_CHECK(0, 0, array_prog1mix, 2); + FIND_CLOSEST_RANGE_CHECK(1, 1, array_prog1mix, 3); + FIND_CLOSEST_RANGE_CHECK(2, 5, array_prog1mix, 4); + + FIND_CLOSEST_RANGE_CHECK(-3, 2, array_prog2a, 0); + FIND_CLOSEST_RANGE_CHECK(3, 4, array_prog2a, 1); + FIND_CLOSEST_RANGE_CHECK(5, 6, array_prog2a, 2); + FIND_CLOSEST_RANGE_CHECK(7, 10, array_prog2a, 3); + + FIND_CLOSEST_RANGE_CHECK(-3, 3, array_prog2b, 0); + FIND_CLOSEST_RANGE_CHECK(4, 5, array_prog2b, 1); + FIND_CLOSEST_RANGE_CHECK(6, 7, array_prog2b, 2); + FIND_CLOSEST_RANGE_CHECK(8, 10, array_prog2b, 3); + + FIND_CLOSEST_RANGE_CHECK(-3, 2, array_prog3a, 0); + FIND_CLOSEST_RANGE_CHECK(3, 5, array_prog3a, 1); + FIND_CLOSEST_RANGE_CHECK(6, 8, array_prog3a, 2); + FIND_CLOSEST_RANGE_CHECK(9, 20, array_prog3a, 3); + + FIND_CLOSEST_RANGE_CHECK(-3, 3, array_prog3b, 0); + FIND_CLOSEST_RANGE_CHECK(4, 6, array_prog3b, 1); + FIND_CLOSEST_RANGE_CHECK(7, 9, array_prog3b, 2); + FIND_CLOSEST_RANGE_CHECK(10, 20, array_prog3b, 3); + + FIND_CLOSEST_RANGE_CHECK(-3, 3, array_prog4a, 0); + FIND_CLOSEST_RANGE_CHECK(4, 7, array_prog4a, 1); + FIND_CLOSEST_RANGE_CHECK(8, 11, array_prog4a, 2); + FIND_CLOSEST_RANGE_CHECK(12, 20, array_prog4a, 3); + + FIND_CLOSEST_RANGE_CHECK(-3, 4, array_prog4b, 0); + FIND_CLOSEST_RANGE_CHECK(5, 8, array_prog4b, 1); + FIND_CLOSEST_RANGE_CHECK(9, 12, array_prog4b, 2); + FIND_CLOSEST_RANGE_CHECK(13, 20, array_prog4b, 3); +} + +#define FIND_CLOSEST_DESC_RANGE_CHECK(from, to, array, exp_idx) \ +{ \ + int i; \ + for (i = from; i <= to; i++) { \ + int found = find_closest_descending(i, array, \ + ARRAY_SIZE(array)); \ + KUNIT_ASSERT_EQ(ctx, exp_idx, found); \ + } \ +} + +static void test_find_closest_descending(struct kunit *ctx) +{ + /* Same arrays as 'test_find_closest' but reversed */ + static const int ina226_avg_tab[] = { 1024, 512, 256, 128, 64, 16, 4, 1 }; + static const unsigned int ad7616_oversampling_avail[] = { + 128, 64, 32, 16, 8, 4, 2, 1 + }; + static u32 wd_timeout_table[] = { 64, 48, 32, 16, 8, 6, 4, 2 }; + static int array_prog1a[] = { 5, 4, 3, 2, 1 }; + static u32 array_prog1b[] = { 6, 5, 4, 3, 2 }; + static int array_prog1mix[] = { 2, 1, 0, -1, -2 }; + static int array_prog2a[] = { 7, 5, 3, 1 }; + static u32 array_prog2b[] = { 8, 6, 4, 2 }; + static int array_prog3a[] = { 10, 7, 4, 1 }; + static u32 array_prog3b[] = { 11, 8, 5, 2 }; + static int array_prog4a[] = { 13, 9, 5, 1 }; + static u32 array_prog4b[] = { 14, 10, 6, 2 }; + + FIND_CLOSEST_DESC_RANGE_CHECK(-3, 2, ina226_avg_tab, 7); + FIND_CLOSEST_DESC_RANGE_CHECK(3, 10, ina226_avg_tab, 6); + FIND_CLOSEST_DESC_RANGE_CHECK(11, 40, ina226_avg_tab, 5); + FIND_CLOSEST_DESC_RANGE_CHECK(41, 96, ina226_avg_tab, 4); + FIND_CLOSEST_DESC_RANGE_CHECK(97, 192, ina226_avg_tab, 3); + 
FIND_CLOSEST_DESC_RANGE_CHECK(193, 384, ina226_avg_tab, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(385, 768, ina226_avg_tab, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(769, 2048, ina226_avg_tab, 0);
+
+	FIND_CLOSEST_DESC_RANGE_CHECK(-3, 1, ad7616_oversampling_avail, 7);
+	FIND_CLOSEST_DESC_RANGE_CHECK(2, 3, ad7616_oversampling_avail, 6);
+	FIND_CLOSEST_DESC_RANGE_CHECK(4, 6, ad7616_oversampling_avail, 5);
+	FIND_CLOSEST_DESC_RANGE_CHECK(7, 12, ad7616_oversampling_avail, 4);
+	FIND_CLOSEST_DESC_RANGE_CHECK(13, 24, ad7616_oversampling_avail, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(25, 48, ad7616_oversampling_avail, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(49, 96, ad7616_oversampling_avail, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(97, 256, ad7616_oversampling_avail, 0);
+
+	FIND_CLOSEST_DESC_RANGE_CHECK(-3, 3, wd_timeout_table, 7);
+	FIND_CLOSEST_DESC_RANGE_CHECK(4, 5, wd_timeout_table, 6);
+	FIND_CLOSEST_DESC_RANGE_CHECK(6, 7, wd_timeout_table, 5);
+	FIND_CLOSEST_DESC_RANGE_CHECK(8, 12, wd_timeout_table, 4);
+	FIND_CLOSEST_DESC_RANGE_CHECK(13, 24, wd_timeout_table, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(25, 40, wd_timeout_table, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(41, 56, wd_timeout_table, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(57, 128, wd_timeout_table, 0);
+
+	/* One could argue that find_closest_descending() should not be used
+	 * for monotonic arrays (like 5,4,3,2,1), but even so, it should still
+	 * work as long as the array is sorted descending. */
+	FIND_CLOSEST_DESC_RANGE_CHECK(-3, 1, array_prog1a, 4);
+	FIND_CLOSEST_DESC_RANGE_CHECK(2, 2, array_prog1a, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(3, 3, array_prog1a, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(4, 4, array_prog1a, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(5, 8, array_prog1a, 0);
+
+	FIND_CLOSEST_DESC_RANGE_CHECK(-3, 2, array_prog1b, 4);
+	FIND_CLOSEST_DESC_RANGE_CHECK(3, 3, array_prog1b, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(4, 4, array_prog1b, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(5, 5, array_prog1b, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(6, 8, array_prog1b, 0);
+
+	FIND_CLOSEST_DESC_RANGE_CHECK(-4, -2, array_prog1mix, 4);
+	FIND_CLOSEST_DESC_RANGE_CHECK(-1, -1, array_prog1mix, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(0, 0, array_prog1mix, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(1, 1, array_prog1mix, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(2, 5, array_prog1mix, 0);
+
+	FIND_CLOSEST_DESC_RANGE_CHECK(-3, 2, array_prog2a, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(3, 4, array_prog2a, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(5, 6, array_prog2a, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(7, 10, array_prog2a, 0);
+
+	FIND_CLOSEST_DESC_RANGE_CHECK(-3, 3, array_prog2b, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(4, 5, array_prog2b, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(6, 7, array_prog2b, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(8, 10, array_prog2b, 0);
+
+	FIND_CLOSEST_DESC_RANGE_CHECK(-3, 2, array_prog3a, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(3, 5, array_prog3a, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(6, 8, array_prog3a, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(9, 20, array_prog3a, 0);
+
+	FIND_CLOSEST_DESC_RANGE_CHECK(-3, 3, array_prog3b, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(4, 6, array_prog3b, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(7, 9, array_prog3b, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(10, 20, array_prog3b, 0);
+
+	FIND_CLOSEST_DESC_RANGE_CHECK(-3, 3, array_prog4a, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(4, 7, array_prog4a, 2);
+	FIND_CLOSEST_DESC_RANGE_CHECK(8, 11, array_prog4a, 1);
+	FIND_CLOSEST_DESC_RANGE_CHECK(12, 20, array_prog4a, 0);
+
+	FIND_CLOSEST_DESC_RANGE_CHECK(-3, 4, array_prog4b, 3);
+	FIND_CLOSEST_DESC_RANGE_CHECK(5, 8,
array_prog4b, 2); + FIND_CLOSEST_DESC_RANGE_CHECK(9, 12, array_prog4b, 1); + FIND_CLOSEST_DESC_RANGE_CHECK(13, 20, array_prog4b, 0); +} + +static struct kunit_case __refdata util_macros_test_cases[] = { + KUNIT_CASE(test_find_closest), + KUNIT_CASE(test_find_closest_descending), + {} +}; + +static struct kunit_suite util_macros_test_suite = { + .name = "util_macros.h", + .test_cases = util_macros_test_cases, +}; + +kunit_test_suites(&util_macros_test_suite); + +MODULE_AUTHOR("Alexandru Ardelean <aardelean@baylibre.com>"); +MODULE_DESCRIPTION("Test cases for util_macros.h helpers"); +MODULE_LICENSE("GPL"); From 45dac1959bbdc498a2abb89919221455225789dc Mon Sep 17 00:00:00 2001 From: zhangguopeng <zhangguopeng@kylinos.cn> Date: Tue, 5 Nov 2024 17:49:41 +0800 Subject: [PATCH 70/71] kernel/reboot: replace sprintf() with sysfs_emit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As Documentation/filesystems/sysfs.rst suggested, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. No functional change intended. Link: https://lkml.kernel.org/r/20241105094941.33739-1-zhangguopeng@kylinos.cn Signed-off-by: zhangguopeng <zhangguopeng@kylinos.cn> Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: Fabio Estevam <festevam@denx.de> Cc: Joel Granados <joel.granados@kernel.org> Cc: Thomas Weißschuh <linux@weissschuh.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- kernel/reboot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/reboot.c b/kernel/reboot.c index ffdf86b717ab..a701000bab34 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1137,7 +1137,7 @@ static ssize_t mode_show(struct kobject *kobj, struct kobj_attribute *attr, char val = REBOOT_UNDEFINED_STR; } - return sprintf(buf, "%s\n", val); + return sysfs_emit(buf, "%s\n", val); } static ssize_t mode_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -1167,7 +1167,7 @@ static struct kobj_attribute reboot_mode_attr = __ATTR_RW(mode); #ifdef CONFIG_X86 static ssize_t force_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", reboot_force); + return sysfs_emit(buf, "%d\n", reboot_force); } static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -1214,7 +1214,7 @@ static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char val = REBOOT_UNDEFINED_STR; } - return sprintf(buf, "%s\n", val); + return sysfs_emit(buf, "%s\n", val); } static ssize_t type_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -1247,7 +1247,7 @@ static struct kobj_attribute reboot_type_attr = __ATTR_RW(type); #ifdef CONFIG_SMP static ssize_t cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", reboot_cpu); + return sysfs_emit(buf, "%d\n", reboot_cpu); } static ssize_t cpu_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) From 2c259a91d8d23a8266092b0dd51b8092877717a4 Mon Sep 17 00:00:00 2001 From: Etienne Buira <etienne.buira@free.fr> Date: Fri, 8 Nov 2024 17:45:32 +0100 Subject: [PATCH 71/71] gdb: lx-symbols: do not error out on monolithic build This avoids spurious message: (gdb) lx-symbols loading vmlinux No source file named kernel/module/main.c. 
Link: https://lkml.kernel.org/r/Zy5ALByQtpO-ddh4@Z926fQmE5jqhFMgp6 Signed-off-by: Etienne Buira <etienne.buira@free.fr> Cc: Andrew Ballance <andrewjballance@gmail.com> Cc: Kieran Bingham <kbingham@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- scripts/gdb/linux/modules.py | 3 +++ scripts/gdb/linux/symbols.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/scripts/gdb/linux/modules.py b/scripts/gdb/linux/modules.py index 298dfcc25eae..fa15f872ddbe 100644 --- a/scripts/gdb/linux/modules.py +++ b/scripts/gdb/linux/modules.py @@ -19,6 +19,9 @@ from linux import cpus, utils, lists, constants module_type = utils.CachedType("struct module") +def has_modules(): + return utils.gdb_eval_or_none("modules") is not None + def module_list(): global module_type modules = utils.gdb_eval_or_none("modules") diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index e8316beb17a7..f6c1b063775a 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -178,6 +178,9 @@ lx-symbols command.""" self.load_all_symbols() + if not modules.has_modules(): + return + if hasattr(gdb, 'Breakpoint'): if self.breakpoint is not None: self.breakpoint.delete()