@@ -447,6 +447,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
447447 vcpu -> kvm = kvm ;
448448 vcpu -> vcpu_id = id ;
449449 vcpu -> pid = NULL ;
450+ rwlock_init (& vcpu -> pid_lock );
450451#ifndef __KVM_HAVE_ARCH_WQP
451452 rcuwait_init (& vcpu -> wait );
452453#endif
@@ -474,7 +475,7 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
474475 * the vcpu->pid pointer, and at destruction time all file descriptors
475476 * are already gone.
476477 */
477- put_pid (rcu_dereference_protected ( vcpu -> pid , 1 ) );
478+ put_pid (vcpu -> pid );
478479
479480 free_page ((unsigned long )vcpu -> run );
480481 kmem_cache_free (kvm_vcpu_cache , vcpu );
@@ -3770,17 +3771,19 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
37703771
37713772int kvm_vcpu_yield_to (struct kvm_vcpu * target )
37723773{
3773- struct pid * pid ;
37743774 struct task_struct * task = NULL ;
3775- int ret = 0 ;
3775+ int ret ;
3776+
3777+ if (!read_trylock (& target -> pid_lock ))
3778+ return 0 ;
3779+
3780+ if (target -> pid )
3781+ task = get_pid_task (target -> pid , PIDTYPE_PID );
3782+
3783+ read_unlock (& target -> pid_lock );
37763784
3777- rcu_read_lock ();
3778- pid = rcu_dereference (target -> pid );
3779- if (pid )
3780- task = get_pid_task (pid , PIDTYPE_PID );
3781- rcu_read_unlock ();
37823785 if (!task )
3783- return ret ;
3786+ return 0 ;
37843787 ret = yield_to (task , 1 );
37853788 put_task_struct (task );
37863789
@@ -3869,59 +3872,71 @@ bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
38693872
38703873void kvm_vcpu_on_spin (struct kvm_vcpu * me , bool yield_to_kernel_mode )
38713874{
3875+ int nr_vcpus , start , i , idx , yielded ;
38723876 struct kvm * kvm = me -> kvm ;
38733877 struct kvm_vcpu * vcpu ;
3874- int last_boosted_vcpu ;
3875- unsigned long i ;
3876- int yielded = 0 ;
38773878 int try = 3 ;
3878- int pass ;
38793879
3880- last_boosted_vcpu = READ_ONCE (kvm -> last_boosted_vcpu );
3880+ nr_vcpus = atomic_read (& kvm -> online_vcpus );
3881+ if (nr_vcpus < 2 )
3882+ return ;
3883+
3884+ /* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */
3885+ smp_rmb ();
3886+
38813887 kvm_vcpu_set_in_spin_loop (me , true);
3888+
38823889 /*
3883- * We boost the priority of a VCPU that is runnable but not
3884- * currently running, because it got preempted by something
3885- * else and called schedule in __vcpu_run. Hopefully that
3886- * VCPU is holding the lock that we need and will release it.
3887- * We approximate round-robin by starting at the last boosted VCPU.
3890+ * The current vCPU ("me") is spinning in kernel mode, i.e. is likely
3891+ * waiting for a resource to become available. Attempt to yield to a
3892+ * vCPU that is runnable, but not currently running, e.g. because the
3893+ * vCPU was preempted by a higher priority task. With luck, the vCPU
3894+ * that was preempted is holding a lock or some other resource that the
3895+ * current vCPU is waiting to acquire, and yielding to the other vCPU
3896+ * will allow it to make forward progress and release the lock (or kick
3897+ * the spinning vCPU, etc).
3898+ *
3899+ * Since KVM has no insight into what exactly the guest is doing,
3900+ * approximate a round-robin selection by iterating over all vCPUs,
3901+ * starting at the last boosted vCPU. I.e. if N=kvm->last_boosted_vcpu,
3902+ * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed.
3903+ *
3904+ * Note, this is inherently racy, e.g. if multiple vCPUs are spinning,
3905+ * they may all try to yield to the same vCPU(s). But as above, this
3906+ * is all best effort due to KVM's lack of visibility into the guest.
38883907 */
3889- for (pass = 0 ; pass < 2 && !yielded && try ; pass ++ ) {
3890- kvm_for_each_vcpu (i , vcpu , kvm ) {
3891- if (!pass && i <= last_boosted_vcpu ) {
3892- i = last_boosted_vcpu ;
3893- continue ;
3894- } else if (pass && i > last_boosted_vcpu )
3895- break ;
3896- if (!READ_ONCE (vcpu -> ready ))
3897- continue ;
3898- if (vcpu == me )
3899- continue ;
3900- if (kvm_vcpu_is_blocking (vcpu ) && !vcpu_dy_runnable (vcpu ))
3901- continue ;
3908+ start = READ_ONCE (kvm -> last_boosted_vcpu ) + 1 ;
3909+ for (i = 0 ; i < nr_vcpus ; i ++ ) {
3910+ idx = (start + i ) % nr_vcpus ;
3911+ if (idx == me -> vcpu_idx )
3912+ continue ;
39023913
3903- /*
3904- * Treat the target vCPU as being in-kernel if it has a
3905- * pending interrupt, as the vCPU trying to yield may
3906- * be spinning waiting on IPI delivery, i.e. the target
3907- * vCPU is in-kernel for the purposes of directed yield.
3908- */
3909- if (READ_ONCE (vcpu -> preempted ) && yield_to_kernel_mode &&
3910- !kvm_arch_dy_has_pending_interrupt (vcpu ) &&
3911- !kvm_arch_vcpu_preempted_in_kernel (vcpu ))
3912- continue ;
3913- if (!kvm_vcpu_eligible_for_directed_yield (vcpu ))
3914- continue ;
3914+ vcpu = xa_load (& kvm -> vcpu_array , idx );
3915+ if (!READ_ONCE (vcpu -> ready ))
3916+ continue ;
3917+ if (kvm_vcpu_is_blocking (vcpu ) && !vcpu_dy_runnable (vcpu ))
3918+ continue ;
39153919
3916- yielded = kvm_vcpu_yield_to (vcpu );
3917- if (yielded > 0 ) {
3918- WRITE_ONCE (kvm -> last_boosted_vcpu , i );
3919- break ;
3920- } else if (yielded < 0 ) {
3921- try -- ;
3922- if (!try )
3923- break ;
3924- }
3920+ /*
3921+ * Treat the target vCPU as being in-kernel if it has a pending
3922+ * interrupt, as the vCPU trying to yield may be spinning
3923+ * waiting on IPI delivery, i.e. the target vCPU is in-kernel
3924+ * for the purposes of directed yield.
3925+ */
3926+ if (READ_ONCE (vcpu -> preempted ) && yield_to_kernel_mode &&
3927+ !kvm_arch_dy_has_pending_interrupt (vcpu ) &&
3928+ !kvm_arch_vcpu_preempted_in_kernel (vcpu ))
3929+ continue ;
3930+
3931+ if (!kvm_vcpu_eligible_for_directed_yield (vcpu ))
3932+ continue ;
3933+
3934+ yielded = kvm_vcpu_yield_to (vcpu );
3935+ if (yielded > 0 ) {
3936+ WRITE_ONCE (kvm -> last_boosted_vcpu , i );
3937+ break ;
3938+ } else if (yielded < 0 && !-- try ) {
3939+ break ;
39253940 }
39263941 }
39273942 kvm_vcpu_set_in_spin_loop (me , false);
@@ -4018,9 +4033,9 @@ static int vcpu_get_pid(void *data, u64 *val)
40184033{
40194034 struct kvm_vcpu * vcpu = data ;
40204035
4021- rcu_read_lock ( );
4022- * val = pid_nr (rcu_dereference ( vcpu -> pid ) );
4023- rcu_read_unlock ( );
4036+ read_lock ( & vcpu -> pid_lock );
4037+ * val = pid_nr (vcpu -> pid );
4038+ read_unlock ( & vcpu -> pid_lock );
40244039 return 0 ;
40254040}
40264041
@@ -4306,7 +4321,14 @@ static long kvm_vcpu_ioctl(struct file *filp,
43064321 r = - EINVAL ;
43074322 if (arg )
43084323 goto out ;
4309- oldpid = rcu_access_pointer (vcpu -> pid );
4324+
4325+ /*
4326+ * Note, vcpu->pid is primarily protected by vcpu->mutex. The
4327+ * dedicated r/w lock allows other tasks, e.g. other vCPUs, to
4328+ * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield
 4329+ * directly to this vCPU.
4330+ */
4331+ oldpid = vcpu -> pid ;
43104332 if (unlikely (oldpid != task_pid (current ))) {
43114333 /* The thread running this VCPU changed. */
43124334 struct pid * newpid ;
@@ -4316,9 +4338,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
43164338 break ;
43174339
43184340 newpid = get_task_pid (current , PIDTYPE_PID );
4319- rcu_assign_pointer (vcpu -> pid , newpid );
4320- if (oldpid )
4321- synchronize_rcu ();
4341+ write_lock (& vcpu -> pid_lock );
4342+ vcpu -> pid = newpid ;
4343+ write_unlock (& vcpu -> pid_lock );
4344+
43224345 put_pid (oldpid );
43234346 }
43244347 vcpu -> wants_to_run = !READ_ONCE (vcpu -> run -> immediate_exit__unsafe );
0 commit comments