Skip to content

Commit dc84bc2

Browse files
davidhildenbrand
authored and
Ingo Molnar
committed
x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()
If track_pfn_copy() fails, we already added the dst VMA to the maple tree. As fork() fails, we'll cleanup the maple tree, and stumble over the dst VMA for which we neither performed any reservation nor copied any page tables.

Consequently untrack_pfn() will see VM_PAT and try obtaining the PAT information from the page table -- which fails because the page table was not copied.

The easiest fix would be to simply clear the VM_PAT flag of the dst VMA if track_pfn_copy() fails. However, the whole thing about "simply" clearing the VM_PAT flag is shaky as well: if we passed track_pfn_copy() and performed a reservation, but copying the page tables fails, we'll simply clear the VM_PAT flag, not properly undoing the reservation ... which is also wrong.

So let's fix it properly: set the VM_PAT flag only if the reservation succeeded (leaving it clear initially), and undo the reservation if anything goes wrong while copying the page tables: clearing the VM_PAT flag after undoing the reservation.

Note that any copied page table entries will get zapped when the VMA will get removed later, after copy_page_range() succeeded; as VM_PAT is not set then, we won't try cleaning VM_PAT up once more and untrack_pfn() will be happy. Note that leaving these page tables in place without a reservation is not a problem, as we are aborting fork(); this process will never run.

A reproducer can trigger this usually at the first try:

https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/reproducers/pat_fork.c

WARNING: CPU: 26 PID: 11650 at arch/x86/mm/pat/memtype.c:983 get_pat_info+0xf6/0x110
Modules linked in: ...
CPU: 26 UID: 0 PID: 11650 Comm: repro3 Not tainted 6.12.0-rc5+ #92
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
RIP: 0010:get_pat_info+0xf6/0x110
...
Call Trace:
<TASK>
...
untrack_pfn+0x52/0x110 unmap_single_vma+0xa6/0xe0 unmap_vmas+0x105/0x1f0 exit_mmap+0xf6/0x460 __mmput+0x4b/0x120 copy_process+0x1bf6/0x2aa0 kernel_clone+0xab/0x440 __do_sys_clone+0x66/0x90 do_syscall_64+0x95/0x180 Likely this case was missed in: d155df5 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed") ... and instead of undoing the reservation we simply cleared the VM_PAT flag. Keep the documentation of these functions in include/linux/pgtable.h, one place is more than sufficient -- we should clean that up for the other functions like track_pfn_remap/untrack_pfn separately. Fixes: d155df5 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed") Fixes: 2ab6403 ("x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3") Reported-by: xingwei lee <[email protected]> Reported-by: yuxin wang <[email protected]> Reported-by: Marius Fleischer <[email protected]> Signed-off-by: David Hildenbrand <[email protected]> Signed-off-by: Ingo Molnar <[email protected]> Cc: Andy Lutomirski <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Rik van Riel <[email protected]> Cc: "H. Peter Anvin" <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Andrew Morton <[email protected]> Cc: [email protected] Link: https://lore.kernel.org/r/[email protected] Closes: https://lore.kernel.org/lkml/CABOYnLx_dnqzpCW99G81DmOr+2UzdmZMk=T3uxwNxwz+R1RAwg@mail.gmail.com/ Closes: https://lore.kernel.org/lkml/CAJg=8jwijTP5fre8woS4JVJQ8iUA6v+iNcsOgtj9Zfpc3obDOQ@mail.gmail.com/
1 parent 878477a commit dc84bc2

File tree

4 files changed

+58
-37
lines changed

4 files changed

+58
-37
lines changed

arch/x86/mm/pat/memtype.c

+28-24
@@ -984,29 +984,42 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
 		return -EINVAL;
 	}
 
-/*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
- *
- * If the vma has a linear pfn mapping for the entire range, we get the prot
- * from pte and reserve the entire vma range with single reserve_pfn_range call.
- */
-int track_pfn_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long *pfn)
 {
+	const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
 	resource_size_t paddr;
-	unsigned long vma_size = vma->vm_end - vma->vm_start;
 	pgprot_t pgprot;
+	int rc;
 
-	if (vma->vm_flags & VM_PAT) {
-		if (get_pat_info(vma, &paddr, &pgprot))
-			return -EINVAL;
-		/* reserve the whole chunk covered by vma. */
-		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
-	}
+	if (!(src_vma->vm_flags & VM_PAT))
+		return 0;
+
+	/*
+	 * Duplicate the PAT information for the dst VMA based on the src
+	 * VMA.
+	 */
+	if (get_pat_info(src_vma, &paddr, &pgprot))
+		return -EINVAL;
+	rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+	if (rc)
+		return rc;
 
+	/* Reservation for the destination VMA succeeded. */
+	vm_flags_set(dst_vma, VM_PAT);
+	*pfn = PHYS_PFN(paddr);
 	return 0;
 }
 
+void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
+{
+	untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
+	/*
+	 * Reservation was freed, any copied page tables will get cleaned
+	 * up later, but without getting PAT involved again.
+	 */
+}
+
 /*
  * prot is passed in as a parameter for the new mapping. If the vma has
  * a linear pfn mapping for the entire range, or no vma is provided,
@@ -1095,15 +1108,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 	}
 }
 
-/*
- * untrack_pfn_clear is called if the following situation fits:
- *
- * 1) while mremapping a pfnmap for a new region, with the old vma after
- *    its pfnmap page table has been removed. The new vma has a new pfnmap
- *    to the same pfn & cache type with VM_PAT set.
- * 2) while duplicating vm area, the new vma fails to copy the pgtable from
- *    old vma.
- */
 void untrack_pfn_clear(struct vm_area_struct *vma)
 {
 	vm_flags_clear(vma, VM_PAT);

include/linux/pgtable.h

+22-6
@@ -1508,14 +1508,25 @@ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 }
 
 /*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
+ * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
+ * tables copied during copy_page_range(). On success, stores the pfn to be
+ * passed to untrack_pfn_copy().
  */
-static inline int track_pfn_copy(struct vm_area_struct *vma)
+static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long *pfn)
 {
 	return 0;
 }
 
+/*
+ * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
+ * copy_page_range(), but after track_pfn_copy() was already called.
+ */
+static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
+		unsigned long pfn)
+{
+}
+
 /*
  * untrack_pfn is called while unmapping a pfnmap for a region.
  * untrack can be called for a specific region indicated by pfn and size or
@@ -1528,8 +1539,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma,
 }
 
 /*
- * untrack_pfn_clear is called while mremapping a pfnmap for a new region
- * or fails to copy pgtable during duplicate vm area.
+ * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
+ *
+ * 1) During mremap() on the src VMA after the page tables were moved.
+ * 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
  */
 static inline void untrack_pfn_clear(struct vm_area_struct *vma)
 {
@@ -1540,7 +1553,10 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 			unsigned long size);
 extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 			pfn_t pfn);
-extern int track_pfn_copy(struct vm_area_struct *vma);
+extern int track_pfn_copy(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long *pfn);
+extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
+		unsigned long pfn);
 extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 			unsigned long size, bool mm_wr_locked);
 extern void untrack_pfn_clear(struct vm_area_struct *vma);

kernel/fork.c

+4
@@ -504,6 +504,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	vma_numab_state_init(new);
 	dup_anon_vma_name(orig, new);
 
+	/* track_pfn_copy() will later take care of copying internal state. */
+	if (unlikely(new->vm_flags & VM_PFNMAP))
+		untrack_pfn_clear(new);
+
 	return new;
 }
 

mm/memory.c

+4-7
@@ -1362,12 +1362,12 @@ int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
 	pgd_t *src_pgd, *dst_pgd;
-	unsigned long next;
 	unsigned long addr = src_vma->vm_start;
 	unsigned long end = src_vma->vm_end;
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	struct mm_struct *src_mm = src_vma->vm_mm;
 	struct mmu_notifier_range range;
+	unsigned long next, pfn;
 	bool is_cow;
 	int ret;
 
@@ -1378,11 +1378,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 		return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
 
 	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
-		/*
-		 * We do not free on error cases below as remove_vma
-		 * gets called on error from higher level routine
-		 */
-		ret = track_pfn_copy(src_vma);
+		ret = track_pfn_copy(dst_vma, src_vma, &pfn);
 		if (ret)
 			return ret;
 	}
@@ -1419,7 +1415,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 			continue;
 		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
 					    addr, next))) {
-			untrack_pfn_clear(dst_vma);
 			ret = -ENOMEM;
 			break;
 		}
@@ -1429,6 +1424,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 		raw_write_seqcount_end(&src_mm->write_protect_seq);
 		mmu_notifier_invalidate_range_end(&range);
 	}
+	if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
+		untrack_pfn_copy(dst_vma, pfn);
 	return ret;
 }
 

0 commit comments

Comments
 (0)