@@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator {
 
 static struct kmem_cache *pte_list_desc_cache;
 struct kmem_cache *mmu_page_header_cache;
-static struct percpu_counter kvm_total_used_mmu_pages;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
 
@@ -485,11 +484,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
 	__set_spte(sptep, new_spte);
 }
 
-/*
- * Update the SPTE (excluding the PFN), but do not track changes in its
- * accessed/dirty status.
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Returns true if the TLB needs to be flushed
  */
-static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 {
 	u64 old_spte = *sptep;
 
@@ -498,57 +498,18 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
 
 	if (!is_shadow_present_pte(old_spte)) {
 		mmu_spte_set(sptep, new_spte);
-		return old_spte;
+		return false;
 	}
 
 	if (!spte_has_volatile_bits(old_spte))
 		__update_clear_spte_fast(sptep, new_spte);
 	else
 		old_spte = __update_clear_spte_slow(sptep, new_spte);
 
-	WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
-
-	return old_spte;
-}
-
-/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changed.
- *
- * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
- * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
- * spte, even though the writable spte might be cached on a CPU's TLB.
- *
- * Returns true if the TLB needs to be flushed
- */
-static bool mmu_spte_update(u64 *sptep, u64 new_spte)
-{
-	bool flush = false;
-	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
-
-	if (!is_shadow_present_pte(old_spte))
-		return false;
-
-	/*
-	 * For the spte updated out of mmu-lock is safe, since
-	 * we always atomically update it, see the comments in
-	 * spte_has_volatile_bits().
-	 */
-	if (is_mmu_writable_spte(old_spte) &&
-	    !is_writable_pte(new_spte))
-		flush = true;
-
-	/*
-	 * Flush TLB when accessed/dirty states are changed in the page tables,
-	 * to guarantee consistency between TLB and page tables.
-	 */
-
-	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte))
-		flush = true;
-
-	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte))
-		flush = true;
+	WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
+		     spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
 
-	return flush;
+	return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
 }
 
 /*
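Note: leaf_spte_change_needs_tlb_flush(), which mmu_spte_update() now returns directly, is defined in spte.h and is not shown in this diff. Purely as a sketch, inferred from the flush rules in the removed comment above and from the WARN added to the aging path below (which implies that clearing only the Accessed bit no longer demands an immediate flush), the helper presumably does something along these lines:

/*
 * Sketch only, not the actual spte.h implementation.  Removing write access
 * from an MMU-writable SPTE, or clearing its Dirty bit, still requires a
 * remote TLB flush so that rmap_write_protect() and dirty logging never see
 * a read-only/clean SPTE while a stale writable translation is cached;
 * Accessed-only transitions do not.
 */
static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte)
{
	if (is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte))
		return true;

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte))
		return true;

	return false;
}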
@@ -1606,8 +1567,13 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
 			clear_bit((ffs(shadow_accessed_mask) - 1),
 				  (unsigned long *)sptep);
 		} else {
+			/*
+			 * WARN if mmu_spte_update() signals the need
+			 * for a TLB flush, as Access tracking a SPTE
+			 * should never trigger an _immediate_ flush.
+			 */
 			spte = mark_spte_for_access_track(spte);
-			mmu_spte_update_no_track(sptep, spte);
+			WARN_ON_ONCE(mmu_spte_update(sptep, spte));
 		}
 		young = true;
 	}
@@ -1655,27 +1621,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
 #endif
 }
 
-/*
- * This value is the sum of all of the kvm instances's
- * kvm->arch.n_used_mmu_pages values.  We need a global,
- * aggregate version in order to make the slab shrinker
- * faster
- */
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
-{
-	kvm->arch.n_used_mmu_pages += nr;
-	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
-}
-
 static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	kvm_mod_used_mmu_pages(kvm, +1);
+	kvm->arch.n_used_mmu_pages++;
 	kvm_account_pgtable_pages((void *)sp->spt, +1);
 }
 
 static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	kvm_mod_used_mmu_pages(kvm, -1);
+	kvm->arch.n_used_mmu_pages--;
 	kvm_account_pgtable_pages((void *)sp->spt, -1);
 }
 
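With the global percpu counter and the shrinker gone, kvm->arch.n_used_mmu_pages (updated only under mmu_lock) is the sole remaining bookkeeping. The per-VM count is still needed; for context only, and not part of this diff, it feeds the existing per-VM page limit in mmu.c, roughly:

/* Context sketch of an existing helper; it is not modified by this patch. */
static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
{
	if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
		return kvm->arch.n_max_mmu_pages -
		       kvm->arch.n_used_mmu_pages;

	return 0;
}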
@@ -3147,13 +3101,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
 }
 
 int kvm_mmu_max_mapping_level(struct kvm *kvm,
-			      const struct kvm_memory_slot *slot, gfn_t gfn,
-			      int max_level)
+			      const struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	bool is_private = kvm_slot_can_be_private(slot) &&
 			  kvm_mem_is_private(kvm, gfn);
 
-	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
+	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
 }
 
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -3373,7 +3326,7 @@ static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault)
 	 * by setting the Writable bit, which can be done out of mmu_lock.
 	 */
 	if (!fault->present)
-		return !kvm_ad_enabled();
+		return !kvm_ad_enabled;
 
 	/*
 	 * Note, instruction fetches and writes are mutually exclusive, ignore
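Here and in the hunks below, kvm_ad_enabled loses its parentheses, i.e. elsewhere in this series it is presumably converted from an inline helper into a plain boolean that is computed once when the SPTE masks are configured. A rough before/after sketch (the real definition lives in spte.h/spte.c and is not part of this diff):

/* Before (roughly): an inline helper derived from the Accessed mask. */
static inline bool kvm_ad_enabled(void)
{
	return !!shadow_accessed_mask;
}

/* After (assumed): a variable that is set up once and read directly. */
extern bool kvm_ad_enabled;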
@@ -3508,8 +3461,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	 * uses A/D bits for non-nested MMUs.  Thus, if A/D bits are
 	 * enabled, the SPTE can't be an access-tracked SPTE.
 	 */
-	if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
-		new_spte = restore_acc_track_spte(new_spte);
+	if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
+		new_spte = restore_acc_track_spte(new_spte) |
+			   shadow_accessed_mask;
 
 	/*
 	 * To keep things simple, only SPTEs that are MMU-writable can
@@ -5485,7 +5439,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 	role.efer_nx = true;
 	role.smm = cpu_role.base.smm;
 	role.guest_mode = cpu_role.base.guest_mode;
-	role.ad_disabled = !kvm_ad_enabled();
+	role.ad_disabled = !kvm_ad_enabled;
 	role.level = kvm_mmu_get_tdp_level(vcpu);
 	role.direct = true;
 	role.has_4_byte_gpte = false;
@@ -6413,8 +6367,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
 	int nr_zapped, batch = 0;
+	LIST_HEAD(invalid_list);
 	bool unstable;
 
+	lockdep_assert_held(&kvm->slots_lock);
+
 restart:
 	list_for_each_entry_safe_reverse(sp, node,
 	      &kvm->arch.active_mmu_pages, link) {
@@ -6446,7 +6403,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 		}
 
 		unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
-				&kvm->arch.zapped_obsolete_pages, &nr_zapped);
+				&invalid_list, &nr_zapped);
 		batch += nr_zapped;
 
 		if (unstable)
@@ -6462,7 +6419,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 	 * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
 	 * running with an obsolete MMU.
 	 */
-	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
 /*
@@ -6525,16 +6482,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 	kvm_tdp_mmu_zap_invalidated_roots(kvm);
 }
 
-static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
-{
-	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
-}
-
 void kvm_mmu_init_vm(struct kvm *kvm)
 {
 	kvm->arch.shadow_mmio_value = shadow_mmio_value;
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
 	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
 	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
 
@@ -6768,7 +6719,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm,
 			continue;
 		}
 
-		spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
+		spte = make_small_spte(kvm, huge_spte, sp->role, index);
 		mmu_spte_set(sptep, spte);
 		__rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
 	}
@@ -6951,8 +6902,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 	 * mapping if the indirect sp has level = 1.
 	 */
 	if (sp->role.direct &&
-	    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
-							PG_LEVEL_NUM)) {
+	    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
 		kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
 
 		if (kvm_available_flush_remote_tlbs_range())
@@ -6980,8 +6930,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
 		kvm_flush_remote_tlbs_memslot(kvm, slot);
 }
 
-void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				   const struct kvm_memory_slot *slot)
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+				const struct kvm_memory_slot *slot)
 {
 	if (kvm_memslots_have_rmaps(kvm)) {
 		write_lock(&kvm->mmu_lock);
@@ -6991,7 +6941,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 
 	if (tdp_mmu_enabled) {
 		read_lock(&kvm->mmu_lock);
-		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+		kvm_tdp_mmu_recover_huge_pages(kvm, slot);
 		read_unlock(&kvm->mmu_lock);
 	}
 }
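The rename from "zap collapsible SPTEs" to "recover huge pages" reflects that, at least for the TDP MMU, the operation can now reinstall huge mappings directly instead of only zapping small SPTEs and waiting for faults to rebuild them (the kvm_tdp_mmu_recover_huge_pages() side lives in tdp_mmu.c and is not shown here). Callers outside mmu.c pick up the new name as well; a hypothetical sketch of the memslot-flags path in x86.c, which triggers this when dirty logging is switched off for a slot:

	/* Hypothetical call-site sketch, not part of this diff; the flag and
	 * slot variable names are illustrative only.
	 */
	if ((old_flags & KVM_MEM_LOG_DIRTY_PAGES) &&
	    !(new_flags & KVM_MEM_LOG_DIRTY_PAGES))
		kvm_mmu_recover_huge_pages(kvm, new_slot);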
@@ -7146,72 +7096,6 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 	}
 }
 
-static unsigned long mmu_shrink_scan(struct shrinker *shrink,
-				     struct shrink_control *sc)
-{
-	struct kvm *kvm;
-	int nr_to_scan = sc->nr_to_scan;
-	unsigned long freed = 0;
-
-	mutex_lock(&kvm_lock);
-
-	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int idx;
-
-		/*
-		 * Never scan more than sc->nr_to_scan VM instances.
-		 * Will not hit this condition practically since we do not try
-		 * to shrink more than one VM and it is very unlikely to see
-		 * !n_used_mmu_pages so many times.
-		 */
-		if (!nr_to_scan--)
-			break;
-		/*
-		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
-		 * here. We may skip a VM instance errorneosly, but we do not
-		 * want to shrink a VM that only started to populate its MMU
-		 * anyway.
-		 */
-		if (!kvm->arch.n_used_mmu_pages &&
-		    !kvm_has_zapped_obsolete_pages(kvm))
-			continue;
-
-		idx = srcu_read_lock(&kvm->srcu);
-		write_lock(&kvm->mmu_lock);
-
-		if (kvm_has_zapped_obsolete_pages(kvm)) {
-			kvm_mmu_commit_zap_page(kvm,
-						&kvm->arch.zapped_obsolete_pages);
-			goto unlock;
-		}
-
-		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
-
-unlock:
-		write_unlock(&kvm->mmu_lock);
-		srcu_read_unlock(&kvm->srcu, idx);
-
-		/*
-		 * unfair on small ones
-		 * per-vm shrinkers cry out
-		 * sadness comes quickly
-		 */
-		list_move_tail(&kvm->vm_list, &vm_list);
-		break;
-	}
-
-	mutex_unlock(&kvm_lock);
-	return freed;
-}
-
-static unsigned long mmu_shrink_count(struct shrinker *shrink,
-				      struct shrink_control *sc)
-{
-	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
-}
-
-static struct shrinker *mmu_shrinker;
-
 static void mmu_destroy_caches(void)
 {
 	kmem_cache_destroy(pte_list_desc_cache);
@@ -7338,23 +7222,8 @@ int kvm_mmu_vendor_module_init(void)
 	if (!mmu_page_header_cache)
 		goto out;
 
-	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
-		goto out;
-
-	mmu_shrinker = shrinker_alloc(0, "x86-mmu");
-	if (!mmu_shrinker)
-		goto out_shrinker;
-
-	mmu_shrinker->count_objects = mmu_shrink_count;
-	mmu_shrinker->scan_objects = mmu_shrink_scan;
-	mmu_shrinker->seeks = DEFAULT_SEEKS * 10;
-
-	shrinker_register(mmu_shrinker);
-
 	return 0;
 
-out_shrinker:
-	percpu_counter_destroy(&kvm_total_used_mmu_pages);
 out:
 	mmu_destroy_caches();
 	return ret;
@@ -7371,8 +7240,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 void kvm_mmu_vendor_module_exit(void)
 {
 	mmu_destroy_caches();
-	percpu_counter_destroy(&kvm_total_used_mmu_pages);
-	shrinker_free(mmu_shrinker);
 }
 
 /*