Note that the vcpu ioctl is asynchronous to vcpu execution.
+4.78 KVM_PPC_GET_HTAB_FD
+
+Capability: KVM_CAP_PPC_HTAB_FD
+Architectures: powerpc
+Type: vm ioctl
+Parameters: Pointer to struct kvm_get_htab_fd (in)
+Returns: file descriptor number (>= 0) on success, -1 on error
+
+This returns a file descriptor that can be used either to read out the
+entries in the guest's hashed page table (HPT), or to write entries to
+initialize the HPT. The returned fd can only be written to if the
+KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and
+can only be read if that bit is clear. The argument struct looks like
+this:
+
+/* For KVM_PPC_GET_HTAB_FD */
+struct kvm_get_htab_fd {
+ __u64 flags;
+ __u64 start_index;
+ __u64 reserved[2];
+};
+
+/* Values for kvm_get_htab_fd.flags */
+#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1)
+#define KVM_GET_HTAB_WRITE ((__u64)0x2)
+
+The `start_index' field gives the index in the HPT of the entry at
+which to start reading. It is ignored when writing.
+
+Reads on the fd will initially supply information about all
+"interesting" HPT entries. Interesting entries are those with the
+bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise
+all entries. When the end of the HPT is reached, the read() will
+return. If read() is called again on the fd, it will start again from
+the beginning of the HPT, but will only return HPT entries that have
+changed since they were last read.
+
+Data read or written is structured as a header (8 bytes) followed by a
+series of valid HPT entries (16 bytes) each. The header indicates how
+many valid HPT entries there are and how many invalid entries follow
+the valid entries. The invalid entries are not represented explicitly
+in the stream. The header format is:
+
+struct kvm_get_htab_header {
+ __u32 index;
+ __u16 n_valid;
+ __u16 n_invalid;
+};
+
+Writes to the fd create HPT entries starting at the index given in the
+header; first `n_valid' valid entries with contents from the data
+written, then `n_invalid' invalid entries, invalidating any previously
+valid entries found.
+
5. The kvm_run structure
------------------------
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/srcu.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
put_page(page);
}
+/*
+ * Functions for reading and writing the hash table via reads and
+ * writes on a file descriptor.
+ *
+ * Reads return the guest view of the hash table, which has to be
+ * pieced together from the real hash table and the guest_rpte
+ * values in the revmap array.
+ *
+ * On writes, each HPTE written is considered in turn, and if it
+ * is valid, it is written to the HPT as if an H_ENTER with the
+ * exact flag set was done. When the invalid count is non-zero
+ * in the header written to the stream, the kernel will make
+ * sure that that many HPTEs are invalid, and invalidate them
+ * if not.
+ */
+
+struct kvm_htab_ctx {
+ unsigned long index;
+ unsigned long flags;
+ struct kvm *kvm;
+ int first_pass;
+};
+
+#define HPTE_SIZE (2 * sizeof(unsigned long))
+
+static long record_hpte(unsigned long flags, unsigned long *hptp,
+ unsigned long *hpte, struct revmap_entry *revp,
+ int want_valid, int first_pass)
+{
+ unsigned long v, r;
+ int ok = 1;
+ int valid, dirty;
+
+ /* Unmodified entries are uninteresting except on the first pass */
+ dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+ if (!first_pass && !dirty)
+ return 0;
+
+ valid = 0;
+ if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+ valid = 1;
+ if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
+ !(hptp[0] & HPTE_V_BOLTED))
+ valid = 0;
+ }
+ if (valid != want_valid)
+ return 0;
+
+ v = r = 0;
+ if (valid || dirty) {
+ /* lock the HPTE so it's stable and read it */
+ preempt_disable();
+ while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+ cpu_relax();
+ v = hptp[0];
+ if (v & HPTE_V_ABSENT) {
+ v &= ~HPTE_V_ABSENT;
+ v |= HPTE_V_VALID;
+ }
+ /* re-evaluate valid and dirty from synchronized HPTE value */
+ valid = !!(v & HPTE_V_VALID);
+ if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
+ valid = 0;
+ r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
+ dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+ /* only clear modified if this is the right sort of entry */
+ if (valid == want_valid && dirty) {
+ r &= ~HPTE_GR_MODIFIED;
+ revp->guest_rpte = r;
+ }
+ asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
+ hptp[0] &= ~HPTE_V_HVLOCK;
+ preempt_enable();
+ if (!(valid == want_valid && (first_pass || dirty)))
+ ok = 0;
+ }
+ hpte[0] = v;
+ hpte[1] = r;
+ return ok;
+}
+
+static ssize_t kvm_htab_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct kvm_htab_ctx *ctx = file->private_data;
+ struct kvm *kvm = ctx->kvm;
+ struct kvm_get_htab_header hdr;
+ unsigned long *hptp;
+ struct revmap_entry *revp;
+ unsigned long i, nb, nw;
+ unsigned long __user *lbuf;
+ struct kvm_get_htab_header __user *hptr;
+ unsigned long flags;
+ int first_pass;
+ unsigned long hpte[2];
+
+ if (!access_ok(VERIFY_WRITE, buf, count))
+ return -EFAULT;
+
+ first_pass = ctx->first_pass;
+ flags = ctx->flags;
+
+ i = ctx->index;
+ hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+ revp = kvm->arch.revmap + i;
+ lbuf = (unsigned long __user *)buf;
+
+ nb = 0;
+ while (nb + sizeof(hdr) + HPTE_SIZE < count) {
+ /* Initialize header */
+ hptr = (struct kvm_get_htab_header __user *)buf;
+ hdr.index = i;
+ hdr.n_valid = 0;
+ hdr.n_invalid = 0;
+ nw = nb;
+ nb += sizeof(hdr);
+ lbuf = (unsigned long __user *)(buf + sizeof(hdr));
+
+ /* Skip uninteresting entries, i.e. clean on not-first pass */
+ if (!first_pass) {
+ while (i < kvm->arch.hpt_npte &&
+ !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
+ ++i;
+ hptp += 2;
+ ++revp;
+ }
+ }
+
+ /* Grab a series of valid entries */
+ while (i < kvm->arch.hpt_npte &&
+ hdr.n_valid < 0xffff &&
+ nb + HPTE_SIZE < count &&
+ record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
+ /* valid entry, write it out */
+ ++hdr.n_valid;
+ if (__put_user(hpte[0], lbuf) ||
+ __put_user(hpte[1], lbuf + 1))
+ return -EFAULT;
+ nb += HPTE_SIZE;
+ lbuf += 2;
+ ++i;
+ hptp += 2;
+ ++revp;
+ }
+ /* Now skip invalid entries while we can */
+ while (i < kvm->arch.hpt_npte &&
+ hdr.n_invalid < 0xffff &&
+ record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
+ /* found an invalid entry */
+ ++hdr.n_invalid;
+ ++i;
+ hptp += 2;
+ ++revp;
+ }
+
+ if (hdr.n_valid || hdr.n_invalid) {
+ /* write back the header */
+ if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
+ return -EFAULT;
+ nw = nb;
+ buf = (char __user *)lbuf;
+ } else {
+ nb = nw;
+ }
+
+ /* Check if we've wrapped around the hash table */
+ if (i >= kvm->arch.hpt_npte) {
+ i = 0;
+ ctx->first_pass = 0;
+ break;
+ }
+ }
+
+ ctx->index = i;
+
+ return nb;
+}
+
+static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct kvm_htab_ctx *ctx = file->private_data;
+ struct kvm *kvm = ctx->kvm;
+ struct kvm_get_htab_header hdr;
+ unsigned long i, j;
+ unsigned long v, r;
+ unsigned long __user *lbuf;
+ unsigned long *hptp;
+ unsigned long tmp[2];
+ ssize_t nb;
+ long int err, ret;
+ int rma_setup;
+
+ if (!access_ok(VERIFY_READ, buf, count))
+ return -EFAULT;
+
+ /* lock out vcpus from running while we're doing this */
+ mutex_lock(&kvm->lock);
+ rma_setup = kvm->arch.rma_setup_done;
+ if (rma_setup) {
+ kvm->arch.rma_setup_done = 0; /* temporarily */
+ /* order rma_setup_done vs. vcpus_running */
+ smp_mb();
+ if (atomic_read(&kvm->arch.vcpus_running)) {
+ kvm->arch.rma_setup_done = 1;
+ mutex_unlock(&kvm->lock);
+ return -EBUSY;
+ }
+ }
+
+ err = 0;
+ for (nb = 0; nb + sizeof(hdr) <= count; ) {
+ err = -EFAULT;
+ if (__copy_from_user(&hdr, buf, sizeof(hdr)))
+ break;
+
+ err = 0;
+ if (nb + hdr.n_valid * HPTE_SIZE > count)
+ break;
+
+ nb += sizeof(hdr);
+ buf += sizeof(hdr);
+
+ err = -EINVAL;
+ i = hdr.index;
+ if (i >= kvm->arch.hpt_npte ||
+ i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
+ break;
+
+ hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+ lbuf = (unsigned long __user *)buf;
+ for (j = 0; j < hdr.n_valid; ++j) {
+ err = -EFAULT;
+ if (__get_user(v, lbuf) || __get_user(r, lbuf + 1))
+ goto out;
+ err = -EINVAL;
+ if (!(v & HPTE_V_VALID))
+ goto out;
+ lbuf += 2;
+ nb += HPTE_SIZE;
+
+ if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
+ kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
+ err = -EIO;
+ ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
+ tmp);
+ if (ret != H_SUCCESS) {
+ pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
+ "r=%lx\n", ret, i, v, r);
+ goto out;
+ }
+ if (!rma_setup && is_vrma_hpte(v)) {
+ unsigned long psize = hpte_page_size(v, r);
+ unsigned long senc = slb_pgsize_encoding(psize);
+ unsigned long lpcr;
+
+ kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+ (VRMA_VSID << SLB_VSID_SHIFT_1T);
+ lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
+ lpcr |= senc << (LPCR_VRMASD_SH - 4);
+ kvm->arch.lpcr = lpcr;
+ rma_setup = 1;
+ }
+ ++i;
+ hptp += 2;
+ }
+
+ for (j = 0; j < hdr.n_invalid; ++j) {
+ if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
+ kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
+ ++i;
+ hptp += 2;
+ }
+ err = 0;
+ }
+
+ out:
+ /* Order HPTE updates vs. rma_setup_done */
+ smp_wmb();
+ kvm->arch.rma_setup_done = rma_setup;
+ mutex_unlock(&kvm->lock);
+
+ if (err)
+ return err;
+ return nb;
+}
+
+static int kvm_htab_release(struct inode *inode, struct file *filp)
+{
+ struct kvm_htab_ctx *ctx = filp->private_data;
+
+ filp->private_data = NULL;
+ if (!(ctx->flags & KVM_GET_HTAB_WRITE))
+ atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
+ kvm_put_kvm(ctx->kvm);
+ kfree(ctx);
+ return 0;
+}
+
+static struct file_operations kvm_htab_fops = {
+ .read = kvm_htab_read,
+ .write = kvm_htab_write,
+ .llseek = default_llseek,
+ .release = kvm_htab_release,
+};
+
+int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
+{
+ int ret;
+ struct kvm_htab_ctx *ctx;
+ int rwflag;
+
+ /* reject flags we don't recognize */
+ if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
+ return -EINVAL;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ kvm_get_kvm(kvm);
+ ctx->kvm = kvm;
+ ctx->index = ghf->start_index;
+ ctx->flags = ghf->flags;
+ ctx->first_pass = 1;
+
+ rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
+ ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
+ if (ret < 0) {
+ kvm_put_kvm(kvm);
+ return ret;
+ }
+
+ if (rwflag == O_RDONLY) {
+ mutex_lock(&kvm->slots_lock);
+ atomic_inc(&kvm->arch.hpte_mod_interest);
+ /* make sure kvmppc_do_h_enter etc. see the increment */
+ synchronize_srcu_expedited(&kvm->srcu);
+ mutex_unlock(&kvm->slots_lock);
+ }
+
+ return ret;
+}
+
void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
struct kvmppc_mmu *mmu = &vcpu->arch.mmu;