drivers/iommu/intel-iommu.c
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
22 */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #include "irq_remapping.h"
50
51 #define ROOT_SIZE VTD_PAGE_SIZE
52 #define CONTEXT_SIZE VTD_PAGE_SIZE
53
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
56 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
57
58 #define IOAPIC_RANGE_START (0xfee00000)
59 #define IOAPIC_RANGE_END (0xfeefffff)
60 #define IOVA_START_ADDR (0x1000)
61
62 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
63
64 #define MAX_AGAW_WIDTH 64
65
66 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
67 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68
69 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
70 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
71 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
72 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
73 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
74
75 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
76 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
77 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
78
79 /* page table handling */
80 #define LEVEL_STRIDE (9)
81 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
82
83 /*
84 * This bitmap is used to advertise the page sizes our hardware supports
85 * to the IOMMU core, which will then use this information to split
86 * physically contiguous memory regions it is mapping into page sizes
87 * that we support.
88 *
89 * Traditionally the IOMMU core just handed us the mappings directly,
90 * after making sure the size is an order of a 4KiB page and that the
91 * mapping has natural alignment.
92 *
93 * To retain this behavior, we currently advertise that we support
94 * all page sizes that are an order of 4KiB.
95 *
96 * If at some point we'd like to utilize the IOMMU core's new behavior,
97 * we could change this to advertise the real page sizes we support.
98 */
99 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
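/*
 * Illustration of the mask above: ~0xFFFUL leaves every bit from 12
 * upwards set, so 4KiB, 8KiB, ..., 2MiB, 1GiB and every other
 * power-of-two size of at least 4KiB is advertised to the IOMMU core.
 */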
100
101 static inline int agaw_to_level(int agaw)
102 {
103 return agaw + 2;
104 }
105
106 static inline int agaw_to_width(int agaw)
107 {
108 return 30 + agaw * LEVEL_STRIDE;
109 }
110
111 static inline int width_to_agaw(int width)
112 {
113 return (width - 30) / LEVEL_STRIDE;
114 }
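/*
 * Worked example for the helpers above: agaw 0 is a 2-level table
 * covering a 30-bit address width, agaw 1 is 3 levels / 39 bits,
 * agaw 2 is 4 levels / 48 bits, each step adding LEVEL_STRIDE (9) bits.
 */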
115
116 static inline unsigned int level_to_offset_bits(int level)
117 {
118 return (level - 1) * LEVEL_STRIDE;
119 }
120
121 static inline int pfn_level_offset(unsigned long pfn, int level)
122 {
123 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
124 }
125
126 static inline unsigned long level_mask(int level)
127 {
128 return -1UL << level_to_offset_bits(level);
129 }
130
131 static inline unsigned long level_size(int level)
132 {
133 return 1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long align_to_level(unsigned long pfn, int level)
137 {
138 return (pfn + level_size(level) - 1) & level_mask(level);
139 }
140
141 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
142 {
143 return 1 << ((lvl - 1) * LEVEL_STRIDE);
144 }
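/*
 * In 4KiB VT-d pages: lvl_to_nr_pages(1) == 1 (4KiB),
 * lvl_to_nr_pages(2) == 512 (2MiB), lvl_to_nr_pages(3) == 512 * 512 (1GiB).
 */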
145
146 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
147 are never going to work. */
148 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
149 {
150 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
151 }
152
153 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
154 {
155 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 static inline unsigned long page_to_dma_pfn(struct page *pg)
158 {
159 return mm_to_dma_pfn(page_to_pfn(pg));
160 }
161 static inline unsigned long virt_to_dma_pfn(void *p)
162 {
163 return page_to_dma_pfn(virt_to_page(p));
164 }
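/*
 * On x86 PAGE_SHIFT and VTD_PAGE_SHIFT are both 12, so the mm<->dma pfn
 * conversions above are identities; they only shift when MM pages are
 * larger than the 4KiB VT-d page.
 */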
165
166 /* global iommu list, set NULL for ignored DMAR units */
167 static struct intel_iommu **g_iommus;
168
169 static void __init check_tylersburg_isoch(void);
170 static int rwbf_quirk;
171
172 /*
173 * set to 1 to panic the kernel if VT-d can't be enabled successfully
174 * (used when the kernel is launched with TXT)
175 */
176 static int force_on = 0;
177
178 /*
179 * 0: Present
180 * 1-11: Reserved
181 * 12-63: Context Ptr (12 - (haw-1))
182 * 64-127: Reserved
183 */
184 struct root_entry {
185 u64 val;
186 u64 rsvd1;
187 };
188 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
189 static inline bool root_present(struct root_entry *root)
190 {
191 return (root->val & 1);
192 }
193 static inline void set_root_present(struct root_entry *root)
194 {
195 root->val |= 1;
196 }
197 static inline void set_root_value(struct root_entry *root, unsigned long value)
198 {
199 root->val |= value & VTD_PAGE_MASK;
200 }
201
202 static inline struct context_entry *
203 get_context_addr_from_root(struct root_entry *root)
204 {
205 return (struct context_entry *)
206 (root_present(root)?phys_to_virt(
207 root->val & VTD_PAGE_MASK) :
208 NULL);
209 }
210
211 /*
212 * low 64 bits:
213 * 0: present
214 * 1: fault processing disable
215 * 2-3: translation type
216 * 12-63: address space root
217 * high 64 bits:
218 * 0-2: address width
219 * 3-6: avail
220 * 8-23: domain id
221 */
222 struct context_entry {
223 u64 lo;
224 u64 hi;
225 };
226
227 static inline bool context_present(struct context_entry *context)
228 {
229 return (context->lo & 1);
230 }
231 static inline void context_set_present(struct context_entry *context)
232 {
233 context->lo |= 1;
234 }
235
236 static inline void context_set_fault_enable(struct context_entry *context)
237 {
238 context->lo &= (((u64)-1) << 2) | 1;
239 }
240
241 static inline void context_set_translation_type(struct context_entry *context,
242 unsigned long value)
243 {
244 context->lo &= (((u64)-1) << 4) | 3;
245 context->lo |= (value & 3) << 2;
246 }
247
248 static inline void context_set_address_root(struct context_entry *context,
249 unsigned long value)
250 {
251 context->lo |= value & VTD_PAGE_MASK;
252 }
253
254 static inline void context_set_address_width(struct context_entry *context,
255 unsigned long value)
256 {
257 context->hi |= value & 7;
258 }
259
260 static inline void context_set_domain_id(struct context_entry *context,
261 unsigned long value)
262 {
263 context->hi |= (value & ((1 << 16) - 1)) << 8;
264 }
265
266 static inline void context_clear_entry(struct context_entry *context)
267 {
268 context->lo = 0;
269 context->hi = 0;
270 }
271
272 /*
273 * 0: readable
274 * 1: writable
275 * 2-6: reserved
276 * 7: super page
277 * 8-10: available
278 * 11: snoop behavior
279 * 12-63: Host physical address
280 */
281 struct dma_pte {
282 u64 val;
283 };
284
285 static inline void dma_clear_pte(struct dma_pte *pte)
286 {
287 pte->val = 0;
288 }
289
290 static inline void dma_set_pte_readable(struct dma_pte *pte)
291 {
292 pte->val |= DMA_PTE_READ;
293 }
294
295 static inline void dma_set_pte_writable(struct dma_pte *pte)
296 {
297 pte->val |= DMA_PTE_WRITE;
298 }
299
300 static inline void dma_set_pte_snp(struct dma_pte *pte)
301 {
302 pte->val |= DMA_PTE_SNP;
303 }
304
305 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
306 {
307 pte->val = (pte->val & ~3) | (prot & 3);
308 }
309
310 static inline u64 dma_pte_addr(struct dma_pte *pte)
311 {
312 #ifdef CONFIG_64BIT
313 return pte->val & VTD_PAGE_MASK;
314 #else
315 /* Must have a full atomic 64-bit read */
316 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
317 #endif
318 }
319
320 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
321 {
322 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
323 }
324
325 static inline bool dma_pte_present(struct dma_pte *pte)
326 {
327 return (pte->val & 3) != 0;
328 }
329
330 static inline bool dma_pte_superpage(struct dma_pte *pte)
331 {
332 return (pte->val & (1 << 7));
333 }
334
335 static inline int first_pte_in_page(struct dma_pte *pte)
336 {
337 return !((unsigned long)pte & ~VTD_PAGE_MASK);
338 }
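/*
 * A page-table page holds 512 8-byte PTEs, so a pte pointer whose low
 * 12 bits are clear is the first entry of its table page; callers use
 * first_pte_in_page() to batch cache flushes one table page at a time.
 */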
339
340 /*
341 * This domain is a statically identity-mapped domain.
342 * 1. This domain creates a static 1:1 mapping of all usable memory.
343 * 2. It maps to each iommu if successful.
344 * 3. Each iommu maps to this domain if successful.
345 */
346 static struct dmar_domain *si_domain;
347 static int hw_pass_through = 1;
348
349 /* devices under the same p2p bridge are owned in one domain */
350 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
351
352 /* domain represents a virtual machine; more than one device
353 * across iommus may be owned by one domain, e.g. a kvm guest.
354 */
355 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
356
357 /* si_domain contains multiple devices */
358 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
359
360 /* define the limit of IOMMUs supported in each domain */
361 #ifdef CONFIG_X86
362 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
363 #else
364 # define IOMMU_UNITS_SUPPORTED 64
365 #endif
366
367 struct dmar_domain {
368 int id; /* domain id */
369 int nid; /* node id */
370 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
371 /* bitmap of iommus this domain uses*/
372
373 struct list_head devices; /* all devices' list */
374 struct iova_domain iovad; /* iova's that belong to this domain */
375
376 struct dma_pte *pgd; /* virtual address */
377 int gaw; /* max guest address width */
378
379 /* adjusted guest address width, 0 is level 2 30-bit */
380 int agaw;
381
382 int flags; /* flags to find out type of domain */
383
384 int iommu_coherency;/* indicate coherency of iommu access */
385 int iommu_snooping; /* indicate snooping control feature*/
386 int iommu_count; /* reference count of iommu */
387 int iommu_superpage;/* Level of superpages supported:
388 0 == 4KiB (no superpages), 1 == 2MiB,
389 2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
390 spinlock_t iommu_lock; /* protect iommu set in domain */
391 u64 max_addr; /* maximum mapped address */
392 };
393
394 /* PCI domain-device relationship */
395 struct device_domain_info {
396 struct list_head link; /* link to domain siblings */
397 struct list_head global; /* link to global list */
398 int segment; /* PCI domain */
399 u8 bus; /* PCI bus number */
400 u8 devfn; /* PCI devfn number */
401 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
402 struct intel_iommu *iommu; /* IOMMU used by this device */
403 struct dmar_domain *domain; /* pointer to domain */
404 };
405
406 static void flush_unmaps_timeout(unsigned long data);
407
408 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
409
410 #define HIGH_WATER_MARK 250
411 struct deferred_flush_tables {
412 int next;
413 struct iova *iova[HIGH_WATER_MARK];
414 struct dmar_domain *domain[HIGH_WATER_MARK];
415 };
416
417 static struct deferred_flush_tables *deferred_flush;
418
419 /* bitmap for indexing intel_iommus */
420 static int g_num_of_iommus;
421
422 static DEFINE_SPINLOCK(async_umap_flush_lock);
423 static LIST_HEAD(unmaps_to_do);
424
425 static int timer_on;
426 static long list_size;
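/*
 * Summary of the deferred-flush machinery declared above (not new
 * behaviour): unless intel_iommu=strict is used, unmapped IOVAs are
 * queued in deferred_flush[] and released in batches from unmap_timer /
 * flush_unmaps_timeout(), or earlier once HIGH_WATER_MARK entries pile
 * up, trading unmap latency for far fewer IOTLB flushes.
 */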
427
428 static void domain_remove_dev_info(struct dmar_domain *domain);
429
430 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
431 int dmar_disabled = 0;
432 #else
433 int dmar_disabled = 1;
434 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
435
436 int intel_iommu_enabled = 0;
437 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
438
439 static int dmar_map_gfx = 1;
440 static int dmar_forcedac;
441 static int intel_iommu_strict;
442 static int intel_iommu_superpage = 1;
443
444 int intel_iommu_gfx_mapped;
445 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
446
447 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
448 static DEFINE_SPINLOCK(device_domain_lock);
449 static LIST_HEAD(device_domain_list);
450
451 static struct iommu_ops intel_iommu_ops;
452
453 static int __init intel_iommu_setup(char *str)
454 {
455 if (!str)
456 return -EINVAL;
457 while (*str) {
458 if (!strncmp(str, "on", 2)) {
459 dmar_disabled = 0;
460 printk(KERN_INFO "Intel-IOMMU: enabled\n");
461 } else if (!strncmp(str, "off", 3)) {
462 dmar_disabled = 1;
463 printk(KERN_INFO "Intel-IOMMU: disabled\n");
464 } else if (!strncmp(str, "igfx_off", 8)) {
465 dmar_map_gfx = 0;
466 printk(KERN_INFO
467 "Intel-IOMMU: disable GFX device mapping\n");
468 } else if (!strncmp(str, "forcedac", 8)) {
469 printk(KERN_INFO
470 "Intel-IOMMU: Forcing DAC for PCI devices\n");
471 dmar_forcedac = 1;
472 } else if (!strncmp(str, "strict", 6)) {
473 printk(KERN_INFO
474 "Intel-IOMMU: disable batched IOTLB flush\n");
475 intel_iommu_strict = 1;
476 } else if (!strncmp(str, "sp_off", 6)) {
477 printk(KERN_INFO
478 "Intel-IOMMU: disable supported super page\n");
479 intel_iommu_superpage = 0;
480 }
481
482 str += strcspn(str, ",");
483 while (*str == ',')
484 str++;
485 }
486 return 0;
487 }
488 __setup("intel_iommu=", intel_iommu_setup);
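/*
 * Example boot parameters accepted by intel_iommu_setup() above
 * (options are comma-separated and may be combined):
 *	intel_iommu=on
 *	intel_iommu=on,strict,sp_off
 *	intel_iommu=igfx_off,forcedac
 */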
489
490 static struct kmem_cache *iommu_domain_cache;
491 static struct kmem_cache *iommu_devinfo_cache;
492 static struct kmem_cache *iommu_iova_cache;
493
494 static inline void *alloc_pgtable_page(int node)
495 {
496 struct page *page;
497 void *vaddr = NULL;
498
499 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
500 if (page)
501 vaddr = page_address(page);
502 return vaddr;
503 }
504
505 static inline void free_pgtable_page(void *vaddr)
506 {
507 free_page((unsigned long)vaddr);
508 }
509
510 static inline void *alloc_domain_mem(void)
511 {
512 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
513 }
514
515 static void free_domain_mem(void *vaddr)
516 {
517 kmem_cache_free(iommu_domain_cache, vaddr);
518 }
519
520 static inline void * alloc_devinfo_mem(void)
521 {
522 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
523 }
524
525 static inline void free_devinfo_mem(void *vaddr)
526 {
527 kmem_cache_free(iommu_devinfo_cache, vaddr);
528 }
529
530 struct iova *alloc_iova_mem(void)
531 {
532 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
533 }
534
535 void free_iova_mem(struct iova *iova)
536 {
537 kmem_cache_free(iommu_iova_cache, iova);
538 }
539
540
541 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
542 {
543 unsigned long sagaw;
544 int agaw = -1;
545
546 sagaw = cap_sagaw(iommu->cap);
547 for (agaw = width_to_agaw(max_gaw);
548 agaw >= 0; agaw--) {
549 if (test_bit(agaw, &sagaw))
550 break;
551 }
552
553 return agaw;
554 }
555
556 /*
557 * Calculate max SAGAW for each iommu.
558 */
559 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
560 {
561 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
562 }
563
564 /*
565 * calculate agaw for each iommu.
566 * "SAGAW" may be different across iommus: use a default agaw, and
567 * fall back to a smaller supported agaw for iommus that don't support the default.
568 */
569 int iommu_calculate_agaw(struct intel_iommu *iommu)
570 {
571 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
572 }
573
574 /* This function only returns a single iommu for a domain */
575 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
576 {
577 int iommu_id;
578
579 /* si_domain and vm domain should not get here. */
580 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
581 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
582
583 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
584 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
585 return NULL;
586
587 return g_iommus[iommu_id];
588 }
589
590 static void domain_update_iommu_coherency(struct dmar_domain *domain)
591 {
592 int i;
593
594 i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
595
596 domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
597
598 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
599 if (!ecap_coherent(g_iommus[i]->ecap)) {
600 domain->iommu_coherency = 0;
601 break;
602 }
603 }
604 }
605
606 static void domain_update_iommu_snooping(struct dmar_domain *domain)
607 {
608 int i;
609
610 domain->iommu_snooping = 1;
611
612 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
613 if (!ecap_sc_support(g_iommus[i]->ecap)) {
614 domain->iommu_snooping = 0;
615 break;
616 }
617 }
618 }
619
620 static void domain_update_iommu_superpage(struct dmar_domain *domain)
621 {
622 struct dmar_drhd_unit *drhd;
623 struct intel_iommu *iommu = NULL;
624 int mask = 0xf;
625
626 if (!intel_iommu_superpage) {
627 domain->iommu_superpage = 0;
628 return;
629 }
630
631 /* set iommu_superpage to the smallest common denominator */
632 for_each_active_iommu(iommu, drhd) {
633 mask &= cap_super_page_val(iommu->cap);
634 if (!mask) {
635 break;
636 }
637 }
638 domain->iommu_superpage = fls(mask);
639 }
640
641 /* Some capabilities may be different across iommus */
642 static void domain_update_iommu_cap(struct dmar_domain *domain)
643 {
644 domain_update_iommu_coherency(domain);
645 domain_update_iommu_snooping(domain);
646 domain_update_iommu_superpage(domain);
647 }
648
649 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
650 {
651 struct dmar_drhd_unit *drhd = NULL;
652 int i;
653
654 for_each_drhd_unit(drhd) {
655 if (drhd->ignored)
656 continue;
657 if (segment != drhd->segment)
658 continue;
659
660 for (i = 0; i < drhd->devices_cnt; i++) {
661 if (drhd->devices[i] &&
662 drhd->devices[i]->bus->number == bus &&
663 drhd->devices[i]->devfn == devfn)
664 return drhd->iommu;
665 if (drhd->devices[i] &&
666 drhd->devices[i]->subordinate &&
667 drhd->devices[i]->subordinate->number <= bus &&
668 drhd->devices[i]->subordinate->busn_res.end >= bus)
669 return drhd->iommu;
670 }
671
672 if (drhd->include_all)
673 return drhd->iommu;
674 }
675
676 return NULL;
677 }
678
679 static void domain_flush_cache(struct dmar_domain *domain,
680 void *addr, int size)
681 {
682 if (!domain->iommu_coherency)
683 clflush_cache_range(addr, size);
684 }
685
686 /* Gets context entry for a given bus and devfn */
687 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
688 u8 bus, u8 devfn)
689 {
690 struct root_entry *root;
691 struct context_entry *context;
692 unsigned long phy_addr;
693 unsigned long flags;
694
695 spin_lock_irqsave(&iommu->lock, flags);
696 root = &iommu->root_entry[bus];
697 context = get_context_addr_from_root(root);
698 if (!context) {
699 context = (struct context_entry *)
700 alloc_pgtable_page(iommu->node);
701 if (!context) {
702 spin_unlock_irqrestore(&iommu->lock, flags);
703 return NULL;
704 }
705 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
706 phy_addr = virt_to_phys((void *)context);
707 set_root_value(root, phy_addr);
708 set_root_present(root);
709 __iommu_flush_cache(iommu, root, sizeof(*root));
710 }
711 spin_unlock_irqrestore(&iommu->lock, flags);
712 return &context[devfn];
713 }
714
715 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
716 {
717 struct root_entry *root;
718 struct context_entry *context;
719 int ret;
720 unsigned long flags;
721
722 spin_lock_irqsave(&iommu->lock, flags);
723 root = &iommu->root_entry[bus];
724 context = get_context_addr_from_root(root);
725 if (!context) {
726 ret = 0;
727 goto out;
728 }
729 ret = context_present(&context[devfn]);
730 out:
731 spin_unlock_irqrestore(&iommu->lock, flags);
732 return ret;
733 }
734
735 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
736 {
737 struct root_entry *root;
738 struct context_entry *context;
739 unsigned long flags;
740
741 spin_lock_irqsave(&iommu->lock, flags);
742 root = &iommu->root_entry[bus];
743 context = get_context_addr_from_root(root);
744 if (context) {
745 context_clear_entry(&context[devfn]);
746 __iommu_flush_cache(iommu, &context[devfn], \
747 sizeof(*context));
748 }
749 spin_unlock_irqrestore(&iommu->lock, flags);
750 }
751
752 static void free_context_table(struct intel_iommu *iommu)
753 {
754 struct root_entry *root;
755 int i;
756 unsigned long flags;
757 struct context_entry *context;
758
759 spin_lock_irqsave(&iommu->lock, flags);
760 if (!iommu->root_entry) {
761 goto out;
762 }
763 for (i = 0; i < ROOT_ENTRY_NR; i++) {
764 root = &iommu->root_entry[i];
765 context = get_context_addr_from_root(root);
766 if (context)
767 free_pgtable_page(context);
768 }
769 free_pgtable_page(iommu->root_entry);
770 iommu->root_entry = NULL;
771 out:
772 spin_unlock_irqrestore(&iommu->lock, flags);
773 }
774
775 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
776 unsigned long pfn, int target_level)
777 {
778 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
779 struct dma_pte *parent, *pte = NULL;
780 int level = agaw_to_level(domain->agaw);
781 int offset;
782
783 BUG_ON(!domain->pgd);
784 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
785 parent = domain->pgd;
786
787 while (level > 0) {
788 void *tmp_page;
789
790 offset = pfn_level_offset(pfn, level);
791 pte = &parent[offset];
792 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
793 break;
794 if (level == target_level)
795 break;
796
797 if (!dma_pte_present(pte)) {
798 uint64_t pteval;
799
800 tmp_page = alloc_pgtable_page(domain->nid);
801
802 if (!tmp_page)
803 return NULL;
804
805 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
806 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
807 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
808 /* Someone else set it while we were thinking; use theirs. */
809 free_pgtable_page(tmp_page);
810 } else {
811 dma_pte_addr(pte);
812 domain_flush_cache(domain, pte, sizeof(*pte));
813 }
814 }
815 parent = phys_to_virt(dma_pte_addr(pte));
816 level--;
817 }
818
819 return pte;
820 }
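/*
 * Example walk for pfn_to_dma_pte() with a 4-level table (agaw 2,
 * 48-bit width) and target_level 1: the pfn is split into four 9-bit
 * indices (pfn bits 27-35, 18-26, 9-17, 0-8), one table is descended
 * per level, and any missing intermediate table is allocated with
 * alloc_pgtable_page() on the way down.
 */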
821
822
823 /* return address's pte at specific level */
824 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
825 unsigned long pfn,
826 int level, int *large_page)
827 {
828 struct dma_pte *parent, *pte = NULL;
829 int total = agaw_to_level(domain->agaw);
830 int offset;
831
832 parent = domain->pgd;
833 while (level <= total) {
834 offset = pfn_level_offset(pfn, total);
835 pte = &parent[offset];
836 if (level == total)
837 return pte;
838
839 if (!dma_pte_present(pte)) {
840 *large_page = total;
841 break;
842 }
843
844 if (pte->val & DMA_PTE_LARGE_PAGE) {
845 *large_page = total;
846 return pte;
847 }
848
849 parent = phys_to_virt(dma_pte_addr(pte));
850 total--;
851 }
852 return NULL;
853 }
854
855 /* clear last level pte, a tlb flush should be followed */
856 static int dma_pte_clear_range(struct dmar_domain *domain,
857 unsigned long start_pfn,
858 unsigned long last_pfn)
859 {
860 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
861 unsigned int large_page = 1;
862 struct dma_pte *first_pte, *pte;
863 int order;
864
865 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
866 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
867 BUG_ON(start_pfn > last_pfn);
868
869 /* we don't need lock here; nobody else touches the iova range */
870 do {
871 large_page = 1;
872 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
873 if (!pte) {
874 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
875 continue;
876 }
877 do {
878 dma_clear_pte(pte);
879 start_pfn += lvl_to_nr_pages(large_page);
880 pte++;
881 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
882
883 domain_flush_cache(domain, first_pte,
884 (void *)pte - (void *)first_pte);
885
886 } while (start_pfn && start_pfn <= last_pfn);
887
888 order = (large_page - 1) * 9;
889 return order;
890 }
891
892 /* free page table pages. last level pte should already be cleared */
893 static void dma_pte_free_pagetable(struct dmar_domain *domain,
894 unsigned long start_pfn,
895 unsigned long last_pfn)
896 {
897 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
898 struct dma_pte *first_pte, *pte;
899 int total = agaw_to_level(domain->agaw);
900 int level;
901 unsigned long tmp;
902 int large_page = 2;
903
904 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
905 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
906 BUG_ON(start_pfn > last_pfn);
907
908 /* We don't need lock here; nobody else touches the iova range */
909 level = 2;
910 while (level <= total) {
911 tmp = align_to_level(start_pfn, level);
912
913 /* If we can't even clear one PTE at this level, we're done */
914 if (tmp + level_size(level) - 1 > last_pfn)
915 return;
916
917 do {
918 large_page = level;
919 first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
920 if (large_page > level)
921 level = large_page + 1;
922 if (!pte) {
923 tmp = align_to_level(tmp + 1, level + 1);
924 continue;
925 }
926 do {
927 if (dma_pte_present(pte)) {
928 free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
929 dma_clear_pte(pte);
930 }
931 pte++;
932 tmp += level_size(level);
933 } while (!first_pte_in_page(pte) &&
934 tmp + level_size(level) - 1 <= last_pfn);
935
936 domain_flush_cache(domain, first_pte,
937 (void *)pte - (void *)first_pte);
938
939 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
940 level++;
941 }
942 /* free pgd */
943 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
944 free_pgtable_page(domain->pgd);
945 domain->pgd = NULL;
946 }
947 }
948
949 /* iommu handling */
950 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
951 {
952 struct root_entry *root;
953 unsigned long flags;
954
955 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
956 if (!root)
957 return -ENOMEM;
958
959 __iommu_flush_cache(iommu, root, ROOT_SIZE);
960
961 spin_lock_irqsave(&iommu->lock, flags);
962 iommu->root_entry = root;
963 spin_unlock_irqrestore(&iommu->lock, flags);
964
965 return 0;
966 }
967
968 static void iommu_set_root_entry(struct intel_iommu *iommu)
969 {
970 void *addr;
971 u32 sts;
972 unsigned long flag;
973
974 addr = iommu->root_entry;
975
976 raw_spin_lock_irqsave(&iommu->register_lock, flag);
977 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
978
979 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
980
981 /* Make sure the hardware completes it */
982 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
983 readl, (sts & DMA_GSTS_RTPS), sts);
984
985 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
986 }
987
988 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
989 {
990 u32 val;
991 unsigned long flag;
992
993 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
994 return;
995
996 raw_spin_lock_irqsave(&iommu->register_lock, flag);
997 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
998
999 /* Make sure the hardware completes it */
1000 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1001 readl, (!(val & DMA_GSTS_WBFS)), val);
1002
1003 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1004 }
1005
1006 /* return value determines if we need a write buffer flush */
1007 static void __iommu_flush_context(struct intel_iommu *iommu,
1008 u16 did, u16 source_id, u8 function_mask,
1009 u64 type)
1010 {
1011 u64 val = 0;
1012 unsigned long flag;
1013
1014 switch (type) {
1015 case DMA_CCMD_GLOBAL_INVL:
1016 val = DMA_CCMD_GLOBAL_INVL;
1017 break;
1018 case DMA_CCMD_DOMAIN_INVL:
1019 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1020 break;
1021 case DMA_CCMD_DEVICE_INVL:
1022 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1023 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1024 break;
1025 default:
1026 BUG();
1027 }
1028 val |= DMA_CCMD_ICC;
1029
1030 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1031 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1032
1033 /* Make sure the hardware completes it */
1034 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1035 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1036
1037 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1038 }
1039
1040 /* return value determines if we need a write buffer flush */
1041 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1042 u64 addr, unsigned int size_order, u64 type)
1043 {
1044 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1045 u64 val = 0, val_iva = 0;
1046 unsigned long flag;
1047
1048 switch (type) {
1049 case DMA_TLB_GLOBAL_FLUSH:
1050 /* global flush doesn't need to set IVA_REG */
1051 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1052 break;
1053 case DMA_TLB_DSI_FLUSH:
1054 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1055 break;
1056 case DMA_TLB_PSI_FLUSH:
1057 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1058 /* Note: always flush non-leaf currently */
1059 val_iva = size_order | addr;
1060 break;
1061 default:
1062 BUG();
1063 }
1064 /* Note: set drain read/write */
1065 #if 0
1066 /*
1067 * This is probably only there to be extra safe; it looks like we can
1068 * ignore it without any impact.
1069 */
1070 if (cap_read_drain(iommu->cap))
1071 val |= DMA_TLB_READ_DRAIN;
1072 #endif
1073 if (cap_write_drain(iommu->cap))
1074 val |= DMA_TLB_WRITE_DRAIN;
1075
1076 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1077 /* Note: Only uses first TLB reg currently */
1078 if (val_iva)
1079 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1080 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1081
1082 /* Make sure the hardware completes it */
1083 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1084 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1085
1086 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1087
1088 /* check IOTLB invalidation granularity */
1089 if (DMA_TLB_IAIG(val) == 0)
1090 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1091 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1092 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1093 (unsigned long long)DMA_TLB_IIRG(type),
1094 (unsigned long long)DMA_TLB_IAIG(val));
1095 }
1096
1097 static struct device_domain_info *iommu_support_dev_iotlb(
1098 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1099 {
1100 int found = 0;
1101 unsigned long flags;
1102 struct device_domain_info *info;
1103 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1104
1105 if (!ecap_dev_iotlb_support(iommu->ecap))
1106 return NULL;
1107
1108 if (!iommu->qi)
1109 return NULL;
1110
1111 spin_lock_irqsave(&device_domain_lock, flags);
1112 list_for_each_entry(info, &domain->devices, link)
1113 if (info->bus == bus && info->devfn == devfn) {
1114 found = 1;
1115 break;
1116 }
1117 spin_unlock_irqrestore(&device_domain_lock, flags);
1118
1119 if (!found || !info->dev)
1120 return NULL;
1121
1122 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1123 return NULL;
1124
1125 if (!dmar_find_matched_atsr_unit(info->dev))
1126 return NULL;
1127
1128 info->iommu = iommu;
1129
1130 return info;
1131 }
1132
1133 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1134 {
1135 if (!info)
1136 return;
1137
1138 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1139 }
1140
1141 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1142 {
1143 if (!info->dev || !pci_ats_enabled(info->dev))
1144 return;
1145
1146 pci_disable_ats(info->dev);
1147 }
1148
1149 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1150 u64 addr, unsigned mask)
1151 {
1152 u16 sid, qdep;
1153 unsigned long flags;
1154 struct device_domain_info *info;
1155
1156 spin_lock_irqsave(&device_domain_lock, flags);
1157 list_for_each_entry(info, &domain->devices, link) {
1158 if (!info->dev || !pci_ats_enabled(info->dev))
1159 continue;
1160
1161 sid = info->bus << 8 | info->devfn;
1162 qdep = pci_ats_queue_depth(info->dev);
1163 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1164 }
1165 spin_unlock_irqrestore(&device_domain_lock, flags);
1166 }
1167
1168 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1169 unsigned long pfn, unsigned int pages, int map)
1170 {
1171 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1172 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1173
1174 BUG_ON(pages == 0);
1175
1176 /*
1177 * Fall back to domain-selective flush if there is no PSI support or the
1178 * size is too big.
1179 * PSI requires the page size to be a power of two, and the base address
1180 * to be naturally aligned to that size.
1181 */
1182 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1183 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1184 DMA_TLB_DSI_FLUSH);
1185 else
1186 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1187 DMA_TLB_PSI_FLUSH);
1188
1189 /*
1190 * In caching mode, changes of pages from non-present to present require
1191 * flush. However, device IOTLB doesn't need to be flushed in this case.
1192 */
1193 if (!cap_caching_mode(iommu->cap) || !map)
1194 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1195 }
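/*
 * Example for the PSI path above: flushing 3 pages rounds up to
 * mask 2 (2^2 = 4 pages), so slightly more than the requested range
 * may be invalidated; the DSI fallback covers the whole domain instead.
 */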
1196
1197 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1198 {
1199 u32 pmen;
1200 unsigned long flags;
1201
1202 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1203 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1204 pmen &= ~DMA_PMEN_EPM;
1205 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1206
1207 /* wait for the protected region status bit to clear */
1208 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1209 readl, !(pmen & DMA_PMEN_PRS), pmen);
1210
1211 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1212 }
1213
1214 static int iommu_enable_translation(struct intel_iommu *iommu)
1215 {
1216 u32 sts;
1217 unsigned long flags;
1218
1219 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1220 iommu->gcmd |= DMA_GCMD_TE;
1221 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1222
1223 /* Make sure the hardware completes it */
1224 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1225 readl, (sts & DMA_GSTS_TES), sts);
1226
1227 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1228 return 0;
1229 }
1230
1231 static int iommu_disable_translation(struct intel_iommu *iommu)
1232 {
1233 u32 sts;
1234 unsigned long flag;
1235
1236 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1237 iommu->gcmd &= ~DMA_GCMD_TE;
1238 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1239
1240 /* Make sure the hardware completes it */
1241 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1242 readl, (!(sts & DMA_GSTS_TES)), sts);
1243
1244 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1245 return 0;
1246 }
1247
1248
1249 static int iommu_init_domains(struct intel_iommu *iommu)
1250 {
1251 unsigned long ndomains;
1252 unsigned long nlongs;
1253
1254 ndomains = cap_ndoms(iommu->cap);
1255 pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1256 ndomains);
1257 nlongs = BITS_TO_LONGS(ndomains);
1258
1259 spin_lock_init(&iommu->lock);
1260
1261 /* TBD: there might be 64K domains,
1262 * consider other allocation for future chip
1263 */
1264 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1265 if (!iommu->domain_ids) {
1266 printk(KERN_ERR "Allocating domain id array failed\n");
1267 return -ENOMEM;
1268 }
1269 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1270 GFP_KERNEL);
1271 if (!iommu->domains) {
1272 printk(KERN_ERR "Allocating domain array failed\n");
1273 return -ENOMEM;
1274 }
1275
1276 /*
1277 * if Caching mode is set, then invalid translations are tagged
1278 * with domainid 0. Hence we need to pre-allocate it.
1279 */
1280 if (cap_caching_mode(iommu->cap))
1281 set_bit(0, iommu->domain_ids);
1282 return 0;
1283 }
1284
1285
1286 static void domain_exit(struct dmar_domain *domain);
1287 static void vm_domain_exit(struct dmar_domain *domain);
1288
1289 void free_dmar_iommu(struct intel_iommu *iommu)
1290 {
1291 struct dmar_domain *domain;
1292 int i;
1293 unsigned long flags;
1294
1295 if ((iommu->domains) && (iommu->domain_ids)) {
1296 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1297 domain = iommu->domains[i];
1298 clear_bit(i, iommu->domain_ids);
1299
1300 spin_lock_irqsave(&domain->iommu_lock, flags);
1301 if (--domain->iommu_count == 0) {
1302 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1303 vm_domain_exit(domain);
1304 else
1305 domain_exit(domain);
1306 }
1307 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1308 }
1309 }
1310
1311 if (iommu->gcmd & DMA_GCMD_TE)
1312 iommu_disable_translation(iommu);
1313
1314 if (iommu->irq) {
1315 irq_set_handler_data(iommu->irq, NULL);
1316 /* This will mask the irq */
1317 free_irq(iommu->irq, iommu);
1318 destroy_irq(iommu->irq);
1319 }
1320
1321 kfree(iommu->domains);
1322 kfree(iommu->domain_ids);
1323
1324 g_iommus[iommu->seq_id] = NULL;
1325
1326 /* if all iommus are freed, free g_iommus */
1327 for (i = 0; i < g_num_of_iommus; i++) {
1328 if (g_iommus[i])
1329 break;
1330 }
1331
1332 if (i == g_num_of_iommus)
1333 kfree(g_iommus);
1334
1335 /* free context mapping */
1336 free_context_table(iommu);
1337 }
1338
1339 static struct dmar_domain *alloc_domain(void)
1340 {
1341 struct dmar_domain *domain;
1342
1343 domain = alloc_domain_mem();
1344 if (!domain)
1345 return NULL;
1346
1347 domain->nid = -1;
1348 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1349 domain->flags = 0;
1350
1351 return domain;
1352 }
1353
1354 static int iommu_attach_domain(struct dmar_domain *domain,
1355 struct intel_iommu *iommu)
1356 {
1357 int num;
1358 unsigned long ndomains;
1359 unsigned long flags;
1360
1361 ndomains = cap_ndoms(iommu->cap);
1362
1363 spin_lock_irqsave(&iommu->lock, flags);
1364
1365 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1366 if (num >= ndomains) {
1367 spin_unlock_irqrestore(&iommu->lock, flags);
1368 printk(KERN_ERR "IOMMU: no free domain ids\n");
1369 return -ENOMEM;
1370 }
1371
1372 domain->id = num;
1373 set_bit(num, iommu->domain_ids);
1374 set_bit(iommu->seq_id, domain->iommu_bmp);
1375 iommu->domains[num] = domain;
1376 spin_unlock_irqrestore(&iommu->lock, flags);
1377
1378 return 0;
1379 }
1380
1381 static void iommu_detach_domain(struct dmar_domain *domain,
1382 struct intel_iommu *iommu)
1383 {
1384 unsigned long flags;
1385 int num, ndomains;
1386 int found = 0;
1387
1388 spin_lock_irqsave(&iommu->lock, flags);
1389 ndomains = cap_ndoms(iommu->cap);
1390 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1391 if (iommu->domains[num] == domain) {
1392 found = 1;
1393 break;
1394 }
1395 }
1396
1397 if (found) {
1398 clear_bit(num, iommu->domain_ids);
1399 clear_bit(iommu->seq_id, domain->iommu_bmp);
1400 iommu->domains[num] = NULL;
1401 }
1402 spin_unlock_irqrestore(&iommu->lock, flags);
1403 }
1404
1405 static struct iova_domain reserved_iova_list;
1406 static struct lock_class_key reserved_rbtree_key;
1407
1408 static int dmar_init_reserved_ranges(void)
1409 {
1410 struct pci_dev *pdev = NULL;
1411 struct iova *iova;
1412 int i;
1413
1414 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1415
1416 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1417 &reserved_rbtree_key);
1418
1419 /* IOAPIC ranges shouldn't be accessed by DMA */
1420 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1421 IOVA_PFN(IOAPIC_RANGE_END));
1422 if (!iova) {
1423 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1424 return -ENODEV;
1425 }
1426
1427 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1428 for_each_pci_dev(pdev) {
1429 struct resource *r;
1430
1431 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1432 r = &pdev->resource[i];
1433 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1434 continue;
1435 iova = reserve_iova(&reserved_iova_list,
1436 IOVA_PFN(r->start),
1437 IOVA_PFN(r->end));
1438 if (!iova) {
1439 printk(KERN_ERR "Reserve iova failed\n");
1440 return -ENODEV;
1441 }
1442 }
1443 }
1444 return 0;
1445 }
1446
1447 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1448 {
1449 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1450 }
1451
1452 static inline int guestwidth_to_adjustwidth(int gaw)
1453 {
1454 int agaw;
1455 int r = (gaw - 12) % 9;
1456
1457 if (r == 0)
1458 agaw = gaw;
1459 else
1460 agaw = gaw + 9 - r;
1461 if (agaw > 64)
1462 agaw = 64;
1463 return agaw;
1464 }
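/*
 * Example: a guest width of 36 bits gives r = (36 - 12) % 9 = 6, so the
 * width is rounded up to 39 bits, i.e. a whole number of 9-bit
 * page-table levels above the 12-bit page offset; 48 stays 48.
 */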
1465
1466 static int domain_init(struct dmar_domain *domain, int guest_width)
1467 {
1468 struct intel_iommu *iommu;
1469 int adjust_width, agaw;
1470 unsigned long sagaw;
1471
1472 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1473 spin_lock_init(&domain->iommu_lock);
1474
1475 domain_reserve_special_ranges(domain);
1476
1477 /* calculate AGAW */
1478 iommu = domain_get_iommu(domain);
1479 if (guest_width > cap_mgaw(iommu->cap))
1480 guest_width = cap_mgaw(iommu->cap);
1481 domain->gaw = guest_width;
1482 adjust_width = guestwidth_to_adjustwidth(guest_width);
1483 agaw = width_to_agaw(adjust_width);
1484 sagaw = cap_sagaw(iommu->cap);
1485 if (!test_bit(agaw, &sagaw)) {
1486 /* hardware doesn't support it, choose a bigger one */
1487 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1488 agaw = find_next_bit(&sagaw, 5, agaw);
1489 if (agaw >= 5)
1490 return -ENODEV;
1491 }
1492 domain->agaw = agaw;
1493 INIT_LIST_HEAD(&domain->devices);
1494
1495 if (ecap_coherent(iommu->ecap))
1496 domain->iommu_coherency = 1;
1497 else
1498 domain->iommu_coherency = 0;
1499
1500 if (ecap_sc_support(iommu->ecap))
1501 domain->iommu_snooping = 1;
1502 else
1503 domain->iommu_snooping = 0;
1504
1505 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1506 domain->iommu_count = 1;
1507 domain->nid = iommu->node;
1508
1509 /* always allocate the top pgd */
1510 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1511 if (!domain->pgd)
1512 return -ENOMEM;
1513 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1514 return 0;
1515 }
1516
1517 static void domain_exit(struct dmar_domain *domain)
1518 {
1519 struct dmar_drhd_unit *drhd;
1520 struct intel_iommu *iommu;
1521
1522 /* Domain 0 is reserved, so don't process it */
1523 if (!domain)
1524 return;
1525
1526 /* Flush any lazy unmaps that may reference this domain */
1527 if (!intel_iommu_strict)
1528 flush_unmaps_timeout(0);
1529
1530 domain_remove_dev_info(domain);
1531 /* destroy iovas */
1532 put_iova_domain(&domain->iovad);
1533
1534 /* clear ptes */
1535 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1536
1537 /* free page tables */
1538 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1539
1540 for_each_active_iommu(iommu, drhd)
1541 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1542 iommu_detach_domain(domain, iommu);
1543
1544 free_domain_mem(domain);
1545 }
1546
1547 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1548 u8 bus, u8 devfn, int translation)
1549 {
1550 struct context_entry *context;
1551 unsigned long flags;
1552 struct intel_iommu *iommu;
1553 struct dma_pte *pgd;
1554 unsigned long num;
1555 unsigned long ndomains;
1556 int id;
1557 int agaw;
1558 struct device_domain_info *info = NULL;
1559
1560 pr_debug("Set context mapping for %02x:%02x.%d\n",
1561 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1562
1563 BUG_ON(!domain->pgd);
1564 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1565 translation != CONTEXT_TT_MULTI_LEVEL);
1566
1567 iommu = device_to_iommu(segment, bus, devfn);
1568 if (!iommu)
1569 return -ENODEV;
1570
1571 context = device_to_context_entry(iommu, bus, devfn);
1572 if (!context)
1573 return -ENOMEM;
1574 spin_lock_irqsave(&iommu->lock, flags);
1575 if (context_present(context)) {
1576 spin_unlock_irqrestore(&iommu->lock, flags);
1577 return 0;
1578 }
1579
1580 id = domain->id;
1581 pgd = domain->pgd;
1582
1583 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1584 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1585 int found = 0;
1586
1587 /* find an available domain id for this device in iommu */
1588 ndomains = cap_ndoms(iommu->cap);
1589 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1590 if (iommu->domains[num] == domain) {
1591 id = num;
1592 found = 1;
1593 break;
1594 }
1595 }
1596
1597 if (found == 0) {
1598 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1599 if (num >= ndomains) {
1600 spin_unlock_irqrestore(&iommu->lock, flags);
1601 printk(KERN_ERR "IOMMU: no free domain ids\n");
1602 return -EFAULT;
1603 }
1604
1605 set_bit(num, iommu->domain_ids);
1606 iommu->domains[num] = domain;
1607 id = num;
1608 }
1609
1610 /* Skip top levels of page tables for
1611 * an iommu which has a smaller agaw than the default.
1612 * Unnecessary for PT mode.
1613 */
1614 if (translation != CONTEXT_TT_PASS_THROUGH) {
1615 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1616 pgd = phys_to_virt(dma_pte_addr(pgd));
1617 if (!dma_pte_present(pgd)) {
1618 spin_unlock_irqrestore(&iommu->lock, flags);
1619 return -ENOMEM;
1620 }
1621 }
1622 }
1623 }
1624
1625 context_set_domain_id(context, id);
1626
1627 if (translation != CONTEXT_TT_PASS_THROUGH) {
1628 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1629 translation = info ? CONTEXT_TT_DEV_IOTLB :
1630 CONTEXT_TT_MULTI_LEVEL;
1631 }
1632 /*
1633 * In pass through mode, AW must be programmed to indicate the largest
1634 * AGAW value supported by hardware. And ASR is ignored by hardware.
1635 */
1636 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1637 context_set_address_width(context, iommu->msagaw);
1638 else {
1639 context_set_address_root(context, virt_to_phys(pgd));
1640 context_set_address_width(context, iommu->agaw);
1641 }
1642
1643 context_set_translation_type(context, translation);
1644 context_set_fault_enable(context);
1645 context_set_present(context);
1646 domain_flush_cache(domain, context, sizeof(*context));
1647
1648 /*
1649 * It's a non-present to present mapping. If the hardware doesn't cache
1650 * non-present entries we only need to flush the write-buffer. If it
1651 * _does_ cache non-present entries, then it does so in the special
1652 * domain #0, which we have to flush:
1653 */
1654 if (cap_caching_mode(iommu->cap)) {
1655 iommu->flush.flush_context(iommu, 0,
1656 (((u16)bus) << 8) | devfn,
1657 DMA_CCMD_MASK_NOBIT,
1658 DMA_CCMD_DEVICE_INVL);
1659 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1660 } else {
1661 iommu_flush_write_buffer(iommu);
1662 }
1663 iommu_enable_dev_iotlb(info);
1664 spin_unlock_irqrestore(&iommu->lock, flags);
1665
1666 spin_lock_irqsave(&domain->iommu_lock, flags);
1667 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1668 domain->iommu_count++;
1669 if (domain->iommu_count == 1)
1670 domain->nid = iommu->node;
1671 domain_update_iommu_cap(domain);
1672 }
1673 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1674 return 0;
1675 }
1676
1677 static int
1678 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1679 int translation)
1680 {
1681 int ret;
1682 struct pci_dev *tmp, *parent;
1683
1684 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1685 pdev->bus->number, pdev->devfn,
1686 translation);
1687 if (ret)
1688 return ret;
1689
1690 /* dependent device mapping */
1691 tmp = pci_find_upstream_pcie_bridge(pdev);
1692 if (!tmp)
1693 return 0;
1694 /* Secondary interface's bus number and devfn 0 */
1695 parent = pdev->bus->self;
1696 while (parent != tmp) {
1697 ret = domain_context_mapping_one(domain,
1698 pci_domain_nr(parent->bus),
1699 parent->bus->number,
1700 parent->devfn, translation);
1701 if (ret)
1702 return ret;
1703 parent = parent->bus->self;
1704 }
1705 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1706 return domain_context_mapping_one(domain,
1707 pci_domain_nr(tmp->subordinate),
1708 tmp->subordinate->number, 0,
1709 translation);
1710 else /* this is a legacy PCI bridge */
1711 return domain_context_mapping_one(domain,
1712 pci_domain_nr(tmp->bus),
1713 tmp->bus->number,
1714 tmp->devfn,
1715 translation);
1716 }
1717
1718 static int domain_context_mapped(struct pci_dev *pdev)
1719 {
1720 int ret;
1721 struct pci_dev *tmp, *parent;
1722 struct intel_iommu *iommu;
1723
1724 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1725 pdev->devfn);
1726 if (!iommu)
1727 return -ENODEV;
1728
1729 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1730 if (!ret)
1731 return ret;
1732 /* dependent device mapping */
1733 tmp = pci_find_upstream_pcie_bridge(pdev);
1734 if (!tmp)
1735 return ret;
1736 /* Secondary interface's bus number and devfn 0 */
1737 parent = pdev->bus->self;
1738 while (parent != tmp) {
1739 ret = device_context_mapped(iommu, parent->bus->number,
1740 parent->devfn);
1741 if (!ret)
1742 return ret;
1743 parent = parent->bus->self;
1744 }
1745 if (pci_is_pcie(tmp))
1746 return device_context_mapped(iommu, tmp->subordinate->number,
1747 0);
1748 else
1749 return device_context_mapped(iommu, tmp->bus->number,
1750 tmp->devfn);
1751 }
1752
1753 /* Returns the number of VT-d pages, but aligned to the MM page size */
1754 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1755 size_t size)
1756 {
1757 host_addr &= ~PAGE_MASK;
1758 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1759 }
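/*
 * Example: host_addr 0x1200 and size 0x1800 leave an in-page offset of
 * 0x200; PAGE_ALIGN(0x200 + 0x1800) = 0x2000, i.e. 2 VT-d pages on x86,
 * where PAGE_SIZE == VTD_PAGE_SIZE == 4KiB.
 */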
1760
1761 /* Return largest possible superpage level for a given mapping */
1762 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1763 unsigned long iov_pfn,
1764 unsigned long phy_pfn,
1765 unsigned long pages)
1766 {
1767 int support, level = 1;
1768 unsigned long pfnmerge;
1769
1770 support = domain->iommu_superpage;
1771
1772 /* To use a large page, the virtual *and* physical addresses
1773 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1774 of them will mean we have to use smaller pages. So just
1775 merge them and check both at once. */
1776 pfnmerge = iov_pfn | phy_pfn;
1777
1778 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1779 pages >>= VTD_STRIDE_SHIFT;
1780 if (!pages)
1781 break;
1782 pfnmerge >>= VTD_STRIDE_SHIFT;
1783 level++;
1784 support--;
1785 }
1786 return level;
1787 }
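/*
 * Example (assuming VTD_STRIDE_SHIFT is 9, matching LEVEL_STRIDE): if
 * iov_pfn and phy_pfn are both 2MiB aligned (low 9 pfn bits clear), at
 * least 512 pages remain to be mapped and the hardware advertises 2MiB
 * superpages, this returns level 2; otherwise it stays at level 1 (4KiB).
 */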
1788
1789 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1790 struct scatterlist *sg, unsigned long phys_pfn,
1791 unsigned long nr_pages, int prot)
1792 {
1793 struct dma_pte *first_pte = NULL, *pte = NULL;
1794 phys_addr_t uninitialized_var(pteval);
1795 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1796 unsigned long sg_res;
1797 unsigned int largepage_lvl = 0;
1798 unsigned long lvl_pages = 0;
1799
1800 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1801
1802 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1803 return -EINVAL;
1804
1805 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1806
1807 if (sg)
1808 sg_res = 0;
1809 else {
1810 sg_res = nr_pages + 1;
1811 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1812 }
1813
1814 while (nr_pages > 0) {
1815 uint64_t tmp;
1816
1817 if (!sg_res) {
1818 sg_res = aligned_nrpages(sg->offset, sg->length);
1819 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1820 sg->dma_length = sg->length;
1821 pteval = page_to_phys(sg_page(sg)) | prot;
1822 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1823 }
1824
1825 if (!pte) {
1826 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1827
1828 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1829 if (!pte)
1830 return -ENOMEM;
1831 /* It is a large page */
1832 if (largepage_lvl > 1) {
1833 pteval |= DMA_PTE_LARGE_PAGE;
1834 /* Ensure that old small page tables are removed to make room
1835 for superpage, if they exist. */
1836 dma_pte_clear_range(domain, iov_pfn,
1837 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1838 dma_pte_free_pagetable(domain, iov_pfn,
1839 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1840 } else {
1841 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1842 }
1843
1844 }
1845 /* We don't need lock here, nobody else
1846 * touches the iova range
1847 */
1848 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1849 if (tmp) {
1850 static int dumps = 5;
1851 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1852 iov_pfn, tmp, (unsigned long long)pteval);
1853 if (dumps) {
1854 dumps--;
1855 debug_dma_dump_mappings(NULL);
1856 }
1857 WARN_ON(1);
1858 }
1859
1860 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1861
1862 BUG_ON(nr_pages < lvl_pages);
1863 BUG_ON(sg_res < lvl_pages);
1864
1865 nr_pages -= lvl_pages;
1866 iov_pfn += lvl_pages;
1867 phys_pfn += lvl_pages;
1868 pteval += lvl_pages * VTD_PAGE_SIZE;
1869 sg_res -= lvl_pages;
1870
1871 /* If the next PTE would be the first in a new page, then we
1872 need to flush the cache on the entries we've just written.
1873 And then we'll need to recalculate 'pte', so clear it and
1874 let it get set again in the if (!pte) block above.
1875
1876 If we're done (!nr_pages) we need to flush the cache too.
1877
1878 Also if we've been setting superpages, we may need to
1879 recalculate 'pte' and switch back to smaller pages for the
1880 end of the mapping, if the trailing size is not enough to
1881 use another superpage (i.e. sg_res < lvl_pages). */
1882 pte++;
1883 if (!nr_pages || first_pte_in_page(pte) ||
1884 (largepage_lvl > 1 && sg_res < lvl_pages)) {
1885 domain_flush_cache(domain, first_pte,
1886 (void *)pte - (void *)first_pte);
1887 pte = NULL;
1888 }
1889
1890 if (!sg_res && nr_pages)
1891 sg = sg_next(sg);
1892 }
1893 return 0;
1894 }
1895
1896 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1897 struct scatterlist *sg, unsigned long nr_pages,
1898 int prot)
1899 {
1900 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1901 }
1902
1903 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1904 unsigned long phys_pfn, unsigned long nr_pages,
1905 int prot)
1906 {
1907 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1908 }
1909
1910 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1911 {
1912 if (!iommu)
1913 return;
1914
1915 clear_context_table(iommu, bus, devfn);
1916 iommu->flush.flush_context(iommu, 0, 0, 0,
1917 DMA_CCMD_GLOBAL_INVL);
1918 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1919 }
1920
1921 static inline void unlink_domain_info(struct device_domain_info *info)
1922 {
1923 assert_spin_locked(&device_domain_lock);
1924 list_del(&info->link);
1925 list_del(&info->global);
1926 if (info->dev)
1927 info->dev->dev.archdata.iommu = NULL;
1928 }
1929
1930 static void domain_remove_dev_info(struct dmar_domain *domain)
1931 {
1932 struct device_domain_info *info;
1933 unsigned long flags;
1934 struct intel_iommu *iommu;
1935
1936 spin_lock_irqsave(&device_domain_lock, flags);
1937 while (!list_empty(&domain->devices)) {
1938 info = list_entry(domain->devices.next,
1939 struct device_domain_info, link);
1940 unlink_domain_info(info);
1941 spin_unlock_irqrestore(&device_domain_lock, flags);
1942
1943 iommu_disable_dev_iotlb(info);
1944 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1945 iommu_detach_dev(iommu, info->bus, info->devfn);
1946 free_devinfo_mem(info);
1947
1948 spin_lock_irqsave(&device_domain_lock, flags);
1949 }
1950 spin_unlock_irqrestore(&device_domain_lock, flags);
1951 }
1952
1953 /*
1954 * find_domain
1955 * Note: struct pci_dev->dev.archdata.iommu stores the device's domain info
1956 */
1957 static struct dmar_domain *
1958 find_domain(struct pci_dev *pdev)
1959 {
1960 struct device_domain_info *info;
1961
1962 /* No lock here, assumes no domain exit in normal case */
1963 info = pdev->dev.archdata.iommu;
1964 if (info)
1965 return info->domain;
1966 return NULL;
1967 }
1968
1969 /* domain is initialized */
1970 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1971 {
1972 struct dmar_domain *domain, *found = NULL;
1973 struct intel_iommu *iommu;
1974 struct dmar_drhd_unit *drhd;
1975 struct device_domain_info *info, *tmp;
1976 struct pci_dev *dev_tmp;
1977 unsigned long flags;
1978 int bus = 0, devfn = 0;
1979 int segment;
1980 int ret;
1981
1982 domain = find_domain(pdev);
1983 if (domain)
1984 return domain;
1985
1986 segment = pci_domain_nr(pdev->bus);
1987
1988 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1989 if (dev_tmp) {
1990 if (pci_is_pcie(dev_tmp)) {
1991 bus = dev_tmp->subordinate->number;
1992 devfn = 0;
1993 } else {
1994 bus = dev_tmp->bus->number;
1995 devfn = dev_tmp->devfn;
1996 }
1997 spin_lock_irqsave(&device_domain_lock, flags);
1998 list_for_each_entry(info, &device_domain_list, global) {
1999 if (info->segment == segment &&
2000 info->bus == bus && info->devfn == devfn) {
2001 found = info->domain;
2002 break;
2003 }
2004 }
2005 spin_unlock_irqrestore(&device_domain_lock, flags);
2006 /* the pcie-to-pci bridge already has a domain, use it */
2007 if (found) {
2008 domain = found;
2009 goto found_domain;
2010 }
2011 }
2012
2013 domain = alloc_domain();
2014 if (!domain)
2015 goto error;
2016
2017 /* Allocate new domain for the device */
2018 drhd = dmar_find_matched_drhd_unit(pdev);
2019 if (!drhd) {
2020 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2021 pci_name(pdev));
2022 free_domain_mem(domain);
2023 return NULL;
2024 }
2025 iommu = drhd->iommu;
2026
2027 ret = iommu_attach_domain(domain, iommu);
2028 if (ret) {
2029 free_domain_mem(domain);
2030 goto error;
2031 }
2032
2033 if (domain_init(domain, gaw)) {
2034 domain_exit(domain);
2035 goto error;
2036 }
2037
2038 /* register pcie-to-pci device */
2039 if (dev_tmp) {
2040 info = alloc_devinfo_mem();
2041 if (!info) {
2042 domain_exit(domain);
2043 goto error;
2044 }
2045 info->segment = segment;
2046 info->bus = bus;
2047 info->devfn = devfn;
2048 info->dev = NULL;
2049 info->domain = domain;
2050 /* This domain is shared by devices under p2p bridge */
2051 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2052
2053 /* the pcie-to-pci bridge already has a domain, use it */
2054 found = NULL;
2055 spin_lock_irqsave(&device_domain_lock, flags);
2056 list_for_each_entry(tmp, &device_domain_list, global) {
2057 if (tmp->segment == segment &&
2058 tmp->bus == bus && tmp->devfn == devfn) {
2059 found = tmp->domain;
2060 break;
2061 }
2062 }
2063 if (found) {
2064 spin_unlock_irqrestore(&device_domain_lock, flags);
2065 free_devinfo_mem(info);
2066 domain_exit(domain);
2067 domain = found;
2068 } else {
2069 list_add(&info->link, &domain->devices);
2070 list_add(&info->global, &device_domain_list);
2071 spin_unlock_irqrestore(&device_domain_lock, flags);
2072 }
2073 }
2074
2075 found_domain:
2076 info = alloc_devinfo_mem();
2077 if (!info)
2078 goto error;
2079 info->segment = segment;
2080 info->bus = pdev->bus->number;
2081 info->devfn = pdev->devfn;
2082 info->dev = pdev;
2083 info->domain = domain;
2084 spin_lock_irqsave(&device_domain_lock, flags);
2085 /* somebody else raced us and set it up already */
2086 found = find_domain(pdev);
2087 if (found != NULL) {
2088 spin_unlock_irqrestore(&device_domain_lock, flags);
2089 if (found != domain) {
2090 domain_exit(domain);
2091 domain = found;
2092 }
2093 free_devinfo_mem(info);
2094 return domain;
2095 }
2096 list_add(&info->link, &domain->devices);
2097 list_add(&info->global, &device_domain_list);
2098 pdev->dev.archdata.iommu = info;
2099 spin_unlock_irqrestore(&device_domain_lock, flags);
2100 return domain;
2101 error:
2102 /* recheck it here, maybe others set it */
2103 return find_domain(pdev);
2104 }
2105
2106 static int iommu_identity_mapping;
2107 #define IDENTMAP_ALL 1
2108 #define IDENTMAP_GFX 2
2109 #define IDENTMAP_AZALIA 4
2110
2111 static int iommu_domain_identity_map(struct dmar_domain *domain,
2112 unsigned long long start,
2113 unsigned long long end)
2114 {
2115 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2116 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2117
2118 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2119 dma_to_mm_pfn(last_vpfn))) {
2120 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2121 return -ENOMEM;
2122 }
2123
2124 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2125 start, end, domain->id);
2126 /*
2127 * The RMRR range might overlap with a physical memory range,
2128 * so clear it first
2129 */
2130 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2131
2132 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2133 last_vpfn - first_vpfn + 1,
2134 DMA_PTE_READ|DMA_PTE_WRITE);
2135 }
2136
2137 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2138 unsigned long long start,
2139 unsigned long long end)
2140 {
2141 struct dmar_domain *domain;
2142 int ret;
2143
2144 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2145 if (!domain)
2146 return -ENOMEM;
2147
2148 /* For _hardware_ passthrough, don't bother. But for software
2149 passthrough, we do it anyway -- it may indicate a memory
2150 range which is reserved in E820 and so didn't get set
2151 up in si_domain to start with */
2152 if (domain == si_domain && hw_pass_through) {
2153 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2154 pci_name(pdev), start, end);
2155 return 0;
2156 }
2157
2158 printk(KERN_INFO
2159 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2160 pci_name(pdev), start, end);
2161
2162 if (end < start) {
2163 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2164 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2165 dmi_get_system_info(DMI_BIOS_VENDOR),
2166 dmi_get_system_info(DMI_BIOS_VERSION),
2167 dmi_get_system_info(DMI_PRODUCT_VERSION));
2168 ret = -EIO;
2169 goto error;
2170 }
2171
2172 if (end >> agaw_to_width(domain->agaw)) {
2173 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2174 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2175 agaw_to_width(domain->agaw),
2176 dmi_get_system_info(DMI_BIOS_VENDOR),
2177 dmi_get_system_info(DMI_BIOS_VERSION),
2178 dmi_get_system_info(DMI_PRODUCT_VERSION));
2179 ret = -EIO;
2180 goto error;
2181 }
2182
2183 ret = iommu_domain_identity_map(domain, start, end);
2184 if (ret)
2185 goto error;
2186
2187 /* context entry init */
2188 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2189 if (ret)
2190 goto error;
2191
2192 return 0;
2193
2194 error:
2195 domain_exit(domain);
2196 return ret;
2197 }
2198
2199 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2200 struct pci_dev *pdev)
2201 {
2202 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2203 return 0;
2204 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2205 rmrr->end_address);
2206 }
2207
2208 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2209 static inline void iommu_prepare_isa(void)
2210 {
2211 struct pci_dev *pdev;
2212 int ret;
2213
2214 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2215 if (!pdev)
2216 return;
2217
2218 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2219 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2220
2221 if (ret)
2222 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2223 "floppy might not work\n");
2224
2225 }
2226 #else
2227 static inline void iommu_prepare_isa(void)
2228 {
2229 return;
2230 }
2231 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2232
2233 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2234
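/*
 * Set up the static identity (si_domain) used for pass-through: attach it
 * to every active iommu and, unless hardware pass-through is in use, build
 * 1:1 mappings for all usable RAM.
 */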
2235 static int __init si_domain_init(int hw)
2236 {
2237 struct dmar_drhd_unit *drhd;
2238 struct intel_iommu *iommu;
2239 int nid, ret = 0;
2240
2241 si_domain = alloc_domain();
2242 if (!si_domain)
2243 return -EFAULT;
2244
2245 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2246
2247 for_each_active_iommu(iommu, drhd) {
2248 ret = iommu_attach_domain(si_domain, iommu);
2249 if (ret) {
2250 domain_exit(si_domain);
2251 return -EFAULT;
2252 }
2253 }
2254
2255 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2256 domain_exit(si_domain);
2257 return -EFAULT;
2258 }
2259
2260 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2261
2262 if (hw)
2263 return 0;
2264
2265 for_each_online_node(nid) {
2266 unsigned long start_pfn, end_pfn;
2267 int i;
2268
2269 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2270 ret = iommu_domain_identity_map(si_domain,
2271 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2272 if (ret)
2273 return ret;
2274 }
2275 }
2276
2277 return 0;
2278 }
2279
2280 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2281 struct pci_dev *pdev);
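/* Return non-zero if @pdev is currently attached to the static identity domain. */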
2282 static int identity_mapping(struct pci_dev *pdev)
2283 {
2284 struct device_domain_info *info;
2285
2286 if (likely(!iommu_identity_mapping))
2287 return 0;
2288
2289 info = pdev->dev.archdata.iommu;
2290 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2291 return (info->domain == si_domain);
2292
2293 return 0;
2294 }
2295
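/*
 * Record @pdev as a member of @domain and program its context entry with the
 * requested translation type; the bookkeeping is undone if context mapping fails.
 */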
2296 static int domain_add_dev_info(struct dmar_domain *domain,
2297 struct pci_dev *pdev,
2298 int translation)
2299 {
2300 struct device_domain_info *info;
2301 unsigned long flags;
2302 int ret;
2303
2304 info = alloc_devinfo_mem();
2305 if (!info)
2306 return -ENOMEM;
2307
2308 info->segment = pci_domain_nr(pdev->bus);
2309 info->bus = pdev->bus->number;
2310 info->devfn = pdev->devfn;
2311 info->dev = pdev;
2312 info->domain = domain;
2313
2314 spin_lock_irqsave(&device_domain_lock, flags);
2315 list_add(&info->link, &domain->devices);
2316 list_add(&info->global, &device_domain_list);
2317 pdev->dev.archdata.iommu = info;
2318 spin_unlock_irqrestore(&device_domain_lock, flags);
2319
2320 ret = domain_context_mapping(domain, pdev, translation);
2321 if (ret) {
2322 spin_lock_irqsave(&device_domain_lock, flags);
2323 unlink_domain_info(info);
2324 spin_unlock_irqrestore(&device_domain_lock, flags);
2325 free_devinfo_mem(info);
2326 return ret;
2327 }
2328
2329 return 0;
2330 }
2331
2332 static bool device_has_rmrr(struct pci_dev *dev)
2333 {
2334 struct dmar_rmrr_unit *rmrr;
2335 int i;
2336
2337 for_each_rmrr_units(rmrr) {
2338 for (i = 0; i < rmrr->devices_cnt; i++) {
2339 /*
2340 * Return TRUE if this RMRR contains the device that
2341 * is passed in.
2342 */
2343 if (rmrr->devices[i] == dev)
2344 return true;
2345 }
2346 }
2347 return false;
2348 }
2349
2350 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2351 {
2352
2353 /*
2354 * We want to prevent any device associated with an RMRR from
2355 * getting placed into the SI Domain. This is done because
2356 * problems exist when devices are moved in and out of domains
2357 * and their respective RMRR info is lost. We exempt USB devices
2358 * from this process due to their usage of RMRRs that are known
2359 * to not be needed after BIOS hand-off to OS.
2360 */
2361 if (device_has_rmrr(pdev) &&
2362 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2363 return 0;
2364
2365 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2366 return 1;
2367
2368 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2369 return 1;
2370
2371 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2372 return 0;
2373
2374 /*
2375 * We want to start off with all devices in the 1:1 domain, and
2376 * take them out later if we find they can't access all of memory.
2377 *
2378 * However, we can't do this for PCI devices behind bridges,
2379 * because all PCI devices behind the same bridge will end up
2380 * with the same source-id on their transactions.
2381 *
2382 * Practically speaking, we can't change things around for these
2383 * devices at run-time, because we can't be sure there'll be no
2384 * DMA transactions in flight for any of their siblings.
2385 *
2386 * So PCI devices (unless they're on the root bus) as well as
2387 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2388 * the 1:1 domain, just in _case_ one of their siblings turns out
2389 * not to be able to map all of memory.
2390 */
2391 if (!pci_is_pcie(pdev)) {
2392 if (!pci_is_root_bus(pdev->bus))
2393 return 0;
2394 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2395 return 0;
2396 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2397 return 0;
2398
2399 /*
2400 * At boot time, we don't yet know if devices will be 64-bit capable.
2401 * Assume that they will -- if they turn out not to be, then we can
2402 * take them out of the 1:1 domain later.
2403 */
2404 if (!startup) {
2405 /*
2406 * If the device's dma_mask is less than the system's memory
2407 * size then this is not a candidate for identity mapping.
2408 */
2409 u64 dma_mask = pdev->dma_mask;
2410
2411 if (pdev->dev.coherent_dma_mask &&
2412 pdev->dev.coherent_dma_mask < dma_mask)
2413 dma_mask = pdev->dev.coherent_dma_mask;
2414
2415 return dma_mask >= dma_get_required_mask(&pdev->dev);
2416 }
2417
2418 return 1;
2419 }
2420
2421 static int __init iommu_prepare_static_identity_mapping(int hw)
2422 {
2423 struct pci_dev *pdev = NULL;
2424 int ret;
2425
2426 ret = si_domain_init(hw);
2427 if (ret)
2428 return -EFAULT;
2429
2430 for_each_pci_dev(pdev) {
2431 if (iommu_should_identity_map(pdev, 1)) {
2432 ret = domain_add_dev_info(si_domain, pdev,
2433 hw ? CONTEXT_TT_PASS_THROUGH :
2434 CONTEXT_TT_MULTI_LEVEL);
2435 if (ret) {
2436 /* device not associated with an iommu */
2437 if (ret == -ENODEV)
2438 continue;
2439 return ret;
2440 }
2441 pr_info("IOMMU: %s identity mapping for device %s\n",
2442 hw ? "hardware" : "software", pci_name(pdev));
2443 }
2444 }
2445
2446 return 0;
2447 }
2448
2449 static int __init init_dmars(void)
2450 {
2451 struct dmar_drhd_unit *drhd;
2452 struct dmar_rmrr_unit *rmrr;
2453 struct pci_dev *pdev;
2454 struct intel_iommu *iommu;
2455 int i, ret;
2456
2457 /*
2458 * for each drhd
2459 * allocate root
2460 * initialize and program root entry to not present
2461 * endfor
2462 */
2463 for_each_drhd_unit(drhd) {
2464 /*
2465 * lock not needed as this is only incremented in the single
2466 * threaded kernel __init code path all other access are read
2467 * only
2468 */
2469 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2470 g_num_of_iommus++;
2471 continue;
2472 }
2473 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2474 IOMMU_UNITS_SUPPORTED);
2475 }
2476
2477 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2478 GFP_KERNEL);
2479 if (!g_iommus) {
2480 printk(KERN_ERR "Allocating global iommu array failed\n");
2481 ret = -ENOMEM;
2482 goto error;
2483 }
2484
2485 deferred_flush = kzalloc(g_num_of_iommus *
2486 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2487 if (!deferred_flush) {
2488 ret = -ENOMEM;
2489 goto error;
2490 }
2491
2492 for_each_drhd_unit(drhd) {
2493 if (drhd->ignored)
2494 continue;
2495
2496 iommu = drhd->iommu;
2497 g_iommus[iommu->seq_id] = iommu;
2498
2499 ret = iommu_init_domains(iommu);
2500 if (ret)
2501 goto error;
2502
2503 /*
2504 * TBD:
2505 * we could share the same root & context tables
2506 * among all IOMMUs. Need to split it later.
2507 */
2508 ret = iommu_alloc_root_entry(iommu);
2509 if (ret) {
2510 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2511 goto error;
2512 }
2513 if (!ecap_pass_through(iommu->ecap))
2514 hw_pass_through = 0;
2515 }
2516
2517 /*
2518 * Start from a sane iommu hardware state.
2519 */
2520 for_each_drhd_unit(drhd) {
2521 if (drhd->ignored)
2522 continue;
2523
2524 iommu = drhd->iommu;
2525
2526 /*
2527 * If the queued invalidation is already initialized by us
2528 * (for example, while enabling interrupt-remapping) then
2529 * we got the things already rolling from a sane state.
2530 */
2531 if (iommu->qi)
2532 continue;
2533
2534 /*
2535 * Clear any previous faults.
2536 */
2537 dmar_fault(-1, iommu);
2538 /*
2539 * Disable queued invalidation if supported and already enabled
2540 * before OS handover.
2541 */
2542 dmar_disable_qi(iommu);
2543 }
2544
2545 for_each_drhd_unit(drhd) {
2546 if (drhd->ignored)
2547 continue;
2548
2549 iommu = drhd->iommu;
2550
2551 if (dmar_enable_qi(iommu)) {
2552 /*
2553 * Queued Invalidate not enabled, use Register Based
2554 * Invalidate
2555 */
2556 iommu->flush.flush_context = __iommu_flush_context;
2557 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2558 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2559 "invalidation\n",
2560 iommu->seq_id,
2561 (unsigned long long)drhd->reg_base_addr);
2562 } else {
2563 iommu->flush.flush_context = qi_flush_context;
2564 iommu->flush.flush_iotlb = qi_flush_iotlb;
2565 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2566 "invalidation\n",
2567 iommu->seq_id,
2568 (unsigned long long)drhd->reg_base_addr);
2569 }
2570 }
2571
2572 if (iommu_pass_through)
2573 iommu_identity_mapping |= IDENTMAP_ALL;
2574
2575 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2576 iommu_identity_mapping |= IDENTMAP_GFX;
2577 #endif
2578
2579 check_tylersburg_isoch();
2580
2581 /*
2582 * If pass-through is not set or not enabled, set up context entries for
2583 * identity mappings for rmrr, gfx and isa, and possibly fall back to
2584 * static identity mapping if iommu_identity_mapping is set.
2585 */
2586 if (iommu_identity_mapping) {
2587 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2588 if (ret) {
2589 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2590 goto error;
2591 }
2592 }
2593 /*
2594 * For each rmrr
2595 * for each dev attached to rmrr
2596 * do
2597 * locate drhd for dev, alloc domain for dev
2598 * allocate free domain
2599 * allocate page table entries for rmrr
2600 * if context not allocated for bus
2601 * allocate and init context
2602 * set present in root table for this bus
2603 * init context with domain, translation etc
2604 * endfor
2605 * endfor
2606 */
2607 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2608 for_each_rmrr_units(rmrr) {
2609 for (i = 0; i < rmrr->devices_cnt; i++) {
2610 pdev = rmrr->devices[i];
2611 /*
2612 * some BIOSes list non-existent devices in the
2613 * DMAR table.
2614 */
2615 if (!pdev)
2616 continue;
2617 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2618 if (ret)
2619 printk(KERN_ERR
2620 "IOMMU: mapping reserved region failed\n");
2621 }
2622 }
2623
2624 iommu_prepare_isa();
2625
2626 /*
2627 * for each drhd
2628 * enable fault log
2629 * global invalidate context cache
2630 * global invalidate iotlb
2631 * enable translation
2632 */
2633 for_each_drhd_unit(drhd) {
2634 if (drhd->ignored) {
2635 /*
2636 * we always have to disable PMRs or DMA may fail on
2637 * this device
2638 */
2639 if (force_on)
2640 iommu_disable_protect_mem_regions(drhd->iommu);
2641 continue;
2642 }
2643 iommu = drhd->iommu;
2644
2645 iommu_flush_write_buffer(iommu);
2646
2647 ret = dmar_set_interrupt(iommu);
2648 if (ret)
2649 goto error;
2650
2651 iommu_set_root_entry(iommu);
2652
2653 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2654 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2655
2656 ret = iommu_enable_translation(iommu);
2657 if (ret)
2658 goto error;
2659
2660 iommu_disable_protect_mem_regions(iommu);
2661 }
2662
2663 return 0;
2664 error:
2665 for_each_drhd_unit(drhd) {
2666 if (drhd->ignored)
2667 continue;
2668 iommu = drhd->iommu;
2669 free_iommu(iommu);
2670 }
2671 kfree(g_iommus);
2672 return ret;
2673 }
2674
2675 /* This takes a number of _MM_ pages, not VTD pages */
2676 static struct iova *intel_alloc_iova(struct device *dev,
2677 struct dmar_domain *domain,
2678 unsigned long nrpages, uint64_t dma_mask)
2679 {
2680 struct pci_dev *pdev = to_pci_dev(dev);
2681 struct iova *iova = NULL;
2682
2683 /* Restrict dma_mask to the width that the iommu can handle */
2684 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2685
2686 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2687 /*
2688 * First try to allocate an io virtual address in
2689 * DMA_BIT_MASK(32) and if that fails then try allocating
2690 * from higher range
2691 */
2692 iova = alloc_iova(&domain->iovad, nrpages,
2693 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2694 if (iova)
2695 return iova;
2696 }
2697 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2698 if (unlikely(!iova)) {
2699 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2700 nrpages, pci_name(pdev));
2701 return NULL;
2702 }
2703
2704 return iova;
2705 }
2706
2707 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2708 {
2709 struct dmar_domain *domain;
2710 int ret;
2711
2712 domain = get_domain_for_dev(pdev,
2713 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2714 if (!domain) {
2715 printk(KERN_ERR
2716 "Allocating domain for %s failed", pci_name(pdev));
2717 return NULL;
2718 }
2719
2720 /* make sure context mapping is ok */
2721 if (unlikely(!domain_context_mapped(pdev))) {
2722 ret = domain_context_mapping(domain, pdev,
2723 CONTEXT_TT_MULTI_LEVEL);
2724 if (ret) {
2725 printk(KERN_ERR
2726 "Domain context map for %s failed",
2727 pci_name(pdev));
2728 return NULL;
2729 }
2730 }
2731
2732 return domain;
2733 }
2734
2735 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2736 {
2737 struct device_domain_info *info;
2738
2739 /* No lock here, assumes no domain exit in normal case */
2740 info = dev->dev.archdata.iommu;
2741 if (likely(info))
2742 return info->domain;
2743
2744 return __get_valid_domain_for_dev(dev);
2745 }
2746
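/* A device marked with DUMMY_DEVICE_DOMAIN_INFO bypasses DMAR translation entirely. */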
2747 static int iommu_dummy(struct pci_dev *pdev)
2748 {
2749 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2750 }
2751
2752 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2753 static int iommu_no_mapping(struct device *dev)
2754 {
2755 struct pci_dev *pdev;
2756 int found;
2757
2758 if (unlikely(dev->bus != &pci_bus_type))
2759 return 1;
2760
2761 pdev = to_pci_dev(dev);
2762 if (iommu_dummy(pdev))
2763 return 1;
2764
2765 if (!iommu_identity_mapping)
2766 return 0;
2767
2768 found = identity_mapping(pdev);
2769 if (found) {
2770 if (iommu_should_identity_map(pdev, 0))
2771 return 1;
2772 else {
2773 /*
2774 * The 32-bit DMA device is removed from si_domain;
2775 * fall back to non-identity mapping.
2776 */
2777 domain_remove_one_dev_info(si_domain, pdev);
2778 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2779 pci_name(pdev));
2780 return 0;
2781 }
2782 } else {
2783 /*
2784 * In case a 64-bit DMA device was detached from a VM, the device
2785 * is put back into si_domain for identity mapping.
2786 */
2787 if (iommu_should_identity_map(pdev, 0)) {
2788 int ret;
2789 ret = domain_add_dev_info(si_domain, pdev,
2790 hw_pass_through ?
2791 CONTEXT_TT_PASS_THROUGH :
2792 CONTEXT_TT_MULTI_LEVEL);
2793 if (!ret) {
2794 printk(KERN_INFO "64bit %s uses identity mapping\n",
2795 pci_name(pdev));
2796 return 1;
2797 }
2798 }
2799 }
2800
2801 return 0;
2802 }
2803
2804 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2805 size_t size, int dir, u64 dma_mask)
2806 {
2807 struct pci_dev *pdev = to_pci_dev(hwdev);
2808 struct dmar_domain *domain;
2809 phys_addr_t start_paddr;
2810 struct iova *iova;
2811 int prot = 0;
2812 int ret;
2813 struct intel_iommu *iommu;
2814 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2815
2816 BUG_ON(dir == DMA_NONE);
2817
2818 if (iommu_no_mapping(hwdev))
2819 return paddr;
2820
2821 domain = get_valid_domain_for_dev(pdev);
2822 if (!domain)
2823 return 0;
2824
2825 iommu = domain_get_iommu(domain);
2826 size = aligned_nrpages(paddr, size);
2827
2828 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2829 if (!iova)
2830 goto error;
2831
2832 /*
2833 * Check if DMAR supports zero-length reads on write only
2834 * mappings.
2835 */
2836 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2837 !cap_zlr(iommu->cap))
2838 prot |= DMA_PTE_READ;
2839 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2840 prot |= DMA_PTE_WRITE;
2841 /*
2842 * paddr .. (paddr + size) might cover only part of a page, but we should
2843 * map the whole page. Note: if two parts of one page are mapped separately,
2844 * we might end up with two guest addresses mapping to the same host paddr,
2845 * but this is not a big problem
2846 */
2847 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2848 mm_to_dma_pfn(paddr_pfn), size, prot);
2849 if (ret)
2850 goto error;
2851
2852 /* it's a non-present to present mapping. Only flush if caching mode */
2853 if (cap_caching_mode(iommu->cap))
2854 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2855 else
2856 iommu_flush_write_buffer(iommu);
2857
2858 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2859 start_paddr += paddr & ~PAGE_MASK;
2860 return start_paddr;
2861
2862 error:
2863 if (iova)
2864 __free_iova(&domain->iovad, iova);
2865 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2866 pci_name(pdev), size, (unsigned long long)paddr, dir);
2867 return 0;
2868 }
2869
2870 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2871 unsigned long offset, size_t size,
2872 enum dma_data_direction dir,
2873 struct dma_attrs *attrs)
2874 {
2875 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2876 dir, to_pci_dev(dev)->dma_mask);
2877 }
2878
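/*
 * Flush the IOTLB for all deferred unmaps and free their iovas; both
 * callers hold async_umap_flush_lock.
 */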
2879 static void flush_unmaps(void)
2880 {
2881 int i, j;
2882
2883 timer_on = 0;
2884
2885 /* just flush them all */
2886 for (i = 0; i < g_num_of_iommus; i++) {
2887 struct intel_iommu *iommu = g_iommus[i];
2888 if (!iommu)
2889 continue;
2890
2891 if (!deferred_flush[i].next)
2892 continue;
2893
2894 /* In caching mode, global flushes make emulation expensive */
2895 if (!cap_caching_mode(iommu->cap))
2896 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2897 DMA_TLB_GLOBAL_FLUSH);
2898 for (j = 0; j < deferred_flush[i].next; j++) {
2899 unsigned long mask;
2900 struct iova *iova = deferred_flush[i].iova[j];
2901 struct dmar_domain *domain = deferred_flush[i].domain[j];
2902
2903 /* On real hardware multiple invalidations are expensive */
2904 if (cap_caching_mode(iommu->cap))
2905 iommu_flush_iotlb_psi(iommu, domain->id,
2906 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2907 else {
2908 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2909 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2910 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2911 }
2912 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2913 }
2914 deferred_flush[i].next = 0;
2915 }
2916
2917 list_size = 0;
2918 }
2919
2920 static void flush_unmaps_timeout(unsigned long data)
2921 {
2922 unsigned long flags;
2923
2924 spin_lock_irqsave(&async_umap_flush_lock, flags);
2925 flush_unmaps();
2926 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2927 }
2928
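/*
 * Defer the freeing of an unmapped iova: queue it in the per-iommu
 * deferred_flush table and arm the unmap timer; flush_unmaps() is run
 * early once HIGH_WATER_MARK entries are pending.
 */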
2929 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2930 {
2931 unsigned long flags;
2932 int next, iommu_id;
2933 struct intel_iommu *iommu;
2934
2935 spin_lock_irqsave(&async_umap_flush_lock, flags);
2936 if (list_size == HIGH_WATER_MARK)
2937 flush_unmaps();
2938
2939 iommu = domain_get_iommu(dom);
2940 iommu_id = iommu->seq_id;
2941
2942 next = deferred_flush[iommu_id].next;
2943 deferred_flush[iommu_id].domain[next] = dom;
2944 deferred_flush[iommu_id].iova[next] = iova;
2945 deferred_flush[iommu_id].next++;
2946
2947 if (!timer_on) {
2948 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2949 timer_on = 1;
2950 }
2951 list_size++;
2952 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2953 }
2954
2955 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2956 size_t size, enum dma_data_direction dir,
2957 struct dma_attrs *attrs)
2958 {
2959 struct pci_dev *pdev = to_pci_dev(dev);
2960 struct dmar_domain *domain;
2961 unsigned long start_pfn, last_pfn;
2962 struct iova *iova;
2963 struct intel_iommu *iommu;
2964
2965 if (iommu_no_mapping(dev))
2966 return;
2967
2968 domain = find_domain(pdev);
2969 BUG_ON(!domain);
2970
2971 iommu = domain_get_iommu(domain);
2972
2973 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2974 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2975 (unsigned long long)dev_addr))
2976 return;
2977
2978 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2979 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2980
2981 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2982 pci_name(pdev), start_pfn, last_pfn);
2983
2984 /* clear the whole page */
2985 dma_pte_clear_range(domain, start_pfn, last_pfn);
2986
2987 /* free page tables */
2988 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2989
2990 if (intel_iommu_strict) {
2991 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2992 last_pfn - start_pfn + 1, 0);
2993 /* free iova */
2994 __free_iova(&domain->iovad, iova);
2995 } else {
2996 add_unmap(domain, iova);
2997 /*
2998 * queue up the release of the unmap to save roughly 1/6th of
2999 * the cpu time used up by the iotlb flush operation...
3000 */
3001 }
3002 }
3003
3004 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3005 dma_addr_t *dma_handle, gfp_t flags,
3006 struct dma_attrs *attrs)
3007 {
3008 void *vaddr;
3009 int order;
3010
3011 size = PAGE_ALIGN(size);
3012 order = get_order(size);
3013
3014 if (!iommu_no_mapping(hwdev))
3015 flags &= ~(GFP_DMA | GFP_DMA32);
3016 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3017 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3018 flags |= GFP_DMA;
3019 else
3020 flags |= GFP_DMA32;
3021 }
3022
3023 vaddr = (void *)__get_free_pages(flags, order);
3024 if (!vaddr)
3025 return NULL;
3026 memset(vaddr, 0, size);
3027
3028 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3029 DMA_BIDIRECTIONAL,
3030 hwdev->coherent_dma_mask);
3031 if (*dma_handle)
3032 return vaddr;
3033 free_pages((unsigned long)vaddr, order);
3034 return NULL;
3035 }
3036
3037 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3038 dma_addr_t dma_handle, struct dma_attrs *attrs)
3039 {
3040 int order;
3041
3042 size = PAGE_ALIGN(size);
3043 order = get_order(size);
3044
3045 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3046 free_pages((unsigned long)vaddr, order);
3047 }
3048
3049 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3050 int nelems, enum dma_data_direction dir,
3051 struct dma_attrs *attrs)
3052 {
3053 struct pci_dev *pdev = to_pci_dev(hwdev);
3054 struct dmar_domain *domain;
3055 unsigned long start_pfn, last_pfn;
3056 struct iova *iova;
3057 struct intel_iommu *iommu;
3058
3059 if (iommu_no_mapping(hwdev))
3060 return;
3061
3062 domain = find_domain(pdev);
3063 BUG_ON(!domain);
3064
3065 iommu = domain_get_iommu(domain);
3066
3067 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3068 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3069 (unsigned long long)sglist[0].dma_address))
3070 return;
3071
3072 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3073 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3074
3075 /* clear the whole page */
3076 dma_pte_clear_range(domain, start_pfn, last_pfn);
3077
3078 /* free page tables */
3079 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3080
3081 if (intel_iommu_strict) {
3082 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3083 last_pfn - start_pfn + 1, 0);
3084 /* free iova */
3085 __free_iova(&domain->iovad, iova);
3086 } else {
3087 add_unmap(domain, iova);
3088 /*
3089 * queue up the release of the unmap to save roughly 1/6th of
3090 * the cpu time used up by the iotlb flush operation...
3091 */
3092 }
3093 }
3094
3095 static int intel_nontranslate_map_sg(struct device *hddev,
3096 struct scatterlist *sglist, int nelems, int dir)
3097 {
3098 int i;
3099 struct scatterlist *sg;
3100
3101 for_each_sg(sglist, sg, nelems, i) {
3102 BUG_ON(!sg_page(sg));
3103 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3104 sg->dma_length = sg->length;
3105 }
3106 return nelems;
3107 }
3108
3109 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3110 enum dma_data_direction dir, struct dma_attrs *attrs)
3111 {
3112 int i;
3113 struct pci_dev *pdev = to_pci_dev(hwdev);
3114 struct dmar_domain *domain;
3115 size_t size = 0;
3116 int prot = 0;
3117 struct iova *iova = NULL;
3118 int ret;
3119 struct scatterlist *sg;
3120 unsigned long start_vpfn;
3121 struct intel_iommu *iommu;
3122
3123 BUG_ON(dir == DMA_NONE);
3124 if (iommu_no_mapping(hwdev))
3125 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3126
3127 domain = get_valid_domain_for_dev(pdev);
3128 if (!domain)
3129 return 0;
3130
3131 iommu = domain_get_iommu(domain);
3132
3133 for_each_sg(sglist, sg, nelems, i)
3134 size += aligned_nrpages(sg->offset, sg->length);
3135
3136 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3137 pdev->dma_mask);
3138 if (!iova) {
3139 sglist->dma_length = 0;
3140 return 0;
3141 }
3142
3143 /*
3144 * Check if DMAR supports zero-length reads on write only
3145 * mappings.
3146 */
3147 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3148 !cap_zlr(iommu->cap))
3149 prot |= DMA_PTE_READ;
3150 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3151 prot |= DMA_PTE_WRITE;
3152
3153 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3154
3155 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3156 if (unlikely(ret)) {
3157 /* clear the page */
3158 dma_pte_clear_range(domain, start_vpfn,
3159 start_vpfn + size - 1);
3160 /* free page tables */
3161 dma_pte_free_pagetable(domain, start_vpfn,
3162 start_vpfn + size - 1);
3163 /* free iova */
3164 __free_iova(&domain->iovad, iova);
3165 return 0;
3166 }
3167
3168 /* it's a non-present to present mapping. Only flush if caching mode */
3169 if (cap_caching_mode(iommu->cap))
3170 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3171 else
3172 iommu_flush_write_buffer(iommu);
3173
3174 return nelems;
3175 }
3176
3177 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3178 {
3179 return !dma_addr;
3180 }
3181
3182 struct dma_map_ops intel_dma_ops = {
3183 .alloc = intel_alloc_coherent,
3184 .free = intel_free_coherent,
3185 .map_sg = intel_map_sg,
3186 .unmap_sg = intel_unmap_sg,
3187 .map_page = intel_map_page,
3188 .unmap_page = intel_unmap_page,
3189 .mapping_error = intel_mapping_error,
3190 };
3191
3192 static inline int iommu_domain_cache_init(void)
3193 {
3194 int ret = 0;
3195
3196 iommu_domain_cache = kmem_cache_create("iommu_domain",
3197 sizeof(struct dmar_domain),
3198 0,
3199 SLAB_HWCACHE_ALIGN,
3201 NULL);
3202 if (!iommu_domain_cache) {
3203 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3204 ret = -ENOMEM;
3205 }
3206
3207 return ret;
3208 }
3209
3210 static inline int iommu_devinfo_cache_init(void)
3211 {
3212 int ret = 0;
3213
3214 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3215 sizeof(struct device_domain_info),
3216 0,
3217 SLAB_HWCACHE_ALIGN,
3218 NULL);
3219 if (!iommu_devinfo_cache) {
3220 printk(KERN_ERR "Couldn't create devinfo cache\n");
3221 ret = -ENOMEM;
3222 }
3223
3224 return ret;
3225 }
3226
3227 static inline int iommu_iova_cache_init(void)
3228 {
3229 int ret = 0;
3230
3231 iommu_iova_cache = kmem_cache_create("iommu_iova",
3232 sizeof(struct iova),
3233 0,
3234 SLAB_HWCACHE_ALIGN,
3235 NULL);
3236 if (!iommu_iova_cache) {
3237 printk(KERN_ERR "Couldn't create iova cache\n");
3238 ret = -ENOMEM;
3239 }
3240
3241 return ret;
3242 }
3243
3244 static int __init iommu_init_mempool(void)
3245 {
3246 int ret;
3247 ret = iommu_iova_cache_init();
3248 if (ret)
3249 return ret;
3250
3251 ret = iommu_domain_cache_init();
3252 if (ret)
3253 goto domain_error;
3254
3255 ret = iommu_devinfo_cache_init();
3256 if (!ret)
3257 return ret;
3258
3259 kmem_cache_destroy(iommu_domain_cache);
3260 domain_error:
3261 kmem_cache_destroy(iommu_iova_cache);
3262
3263 return -ENOMEM;
3264 }
3265
3266 static void __init iommu_exit_mempool(void)
3267 {
3268 kmem_cache_destroy(iommu_devinfo_cache);
3269 kmem_cache_destroy(iommu_domain_cache);
3270 kmem_cache_destroy(iommu_iova_cache);
3272 }
3273
3274 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3275 {
3276 struct dmar_drhd_unit *drhd;
3277 u32 vtbar;
3278 int rc;
3279
3280 /* We know that this device on this chipset has its own IOMMU.
3281 * If we find it under a different IOMMU, then the BIOS is lying
3282 * to us. Hope that the IOMMU for this device is actually
3283 * disabled, and it needs no translation...
3284 */
3285 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3286 if (rc) {
3287 /* "can't" happen */
3288 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3289 return;
3290 }
3291 vtbar &= 0xffff0000;
3292
3293 /* we know that this iommu should be at offset 0xa000 from vtbar */
3294 drhd = dmar_find_matched_drhd_unit(pdev);
3295 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3296 TAINT_FIRMWARE_WORKAROUND,
3297 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3298 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3299 }
3300 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3301
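/*
 * Mark DRHD units that cover no PCI devices as ignored; units that cover
 * only graphics devices are either kept (setting intel_iommu_gfx_mapped)
 * or bypassed, depending on dmar_map_gfx.
 */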
3302 static void __init init_no_remapping_devices(void)
3303 {
3304 struct dmar_drhd_unit *drhd;
3305
3306 for_each_drhd_unit(drhd) {
3307 if (!drhd->include_all) {
3308 int i;
3309 for (i = 0; i < drhd->devices_cnt; i++)
3310 if (drhd->devices[i] != NULL)
3311 break;
3312 /* ignore DMAR unit if no pci devices exist */
3313 if (i == drhd->devices_cnt)
3314 drhd->ignored = 1;
3315 }
3316 }
3317
3318 for_each_drhd_unit(drhd) {
3319 int i;
3320 if (drhd->ignored || drhd->include_all)
3321 continue;
3322
3323 for (i = 0; i < drhd->devices_cnt; i++)
3324 if (drhd->devices[i] &&
3325 !IS_GFX_DEVICE(drhd->devices[i]))
3326 break;
3327
3328 if (i < drhd->devices_cnt)
3329 continue;
3330
3331 /* This IOMMU has *only* gfx devices. Either bypass it or
3332 set the gfx_mapped flag, as appropriate */
3333 if (dmar_map_gfx) {
3334 intel_iommu_gfx_mapped = 1;
3335 } else {
3336 drhd->ignored = 1;
3337 for (i = 0; i < drhd->devices_cnt; i++) {
3338 if (!drhd->devices[i])
3339 continue;
3340 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3341 }
3342 }
3343 }
3344 }
3345
3346 #ifdef CONFIG_SUSPEND
3347 static int init_iommu_hw(void)
3348 {
3349 struct dmar_drhd_unit *drhd;
3350 struct intel_iommu *iommu = NULL;
3351
3352 for_each_active_iommu(iommu, drhd)
3353 if (iommu->qi)
3354 dmar_reenable_qi(iommu);
3355
3356 for_each_iommu(iommu, drhd) {
3357 if (drhd->ignored) {
3358 /*
3359 * we always have to disable PMRs or DMA may fail on
3360 * this device
3361 */
3362 if (force_on)
3363 iommu_disable_protect_mem_regions(iommu);
3364 continue;
3365 }
3366
3367 iommu_flush_write_buffer(iommu);
3368
3369 iommu_set_root_entry(iommu);
3370
3371 iommu->flush.flush_context(iommu, 0, 0, 0,
3372 DMA_CCMD_GLOBAL_INVL);
3373 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3374 DMA_TLB_GLOBAL_FLUSH);
3375 if (iommu_enable_translation(iommu))
3376 return 1;
3377 iommu_disable_protect_mem_regions(iommu);
3378 }
3379
3380 return 0;
3381 }
3382
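/* Globally invalidate the context cache and IOTLB on every active iommu. */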
3383 static void iommu_flush_all(void)
3384 {
3385 struct dmar_drhd_unit *drhd;
3386 struct intel_iommu *iommu;
3387
3388 for_each_active_iommu(iommu, drhd) {
3389 iommu->flush.flush_context(iommu, 0, 0, 0,
3390 DMA_CCMD_GLOBAL_INVL);
3391 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3392 DMA_TLB_GLOBAL_FLUSH);
3393 }
3394 }
3395
3396 static int iommu_suspend(void)
3397 {
3398 struct dmar_drhd_unit *drhd;
3399 struct intel_iommu *iommu = NULL;
3400 unsigned long flag;
3401
3402 for_each_active_iommu(iommu, drhd) {
3403 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3404 GFP_ATOMIC);
3405 if (!iommu->iommu_state)
3406 goto nomem;
3407 }
3408
3409 iommu_flush_all();
3410
3411 for_each_active_iommu(iommu, drhd) {
3412 iommu_disable_translation(iommu);
3413
3414 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3415
3416 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3417 readl(iommu->reg + DMAR_FECTL_REG);
3418 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3419 readl(iommu->reg + DMAR_FEDATA_REG);
3420 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3421 readl(iommu->reg + DMAR_FEADDR_REG);
3422 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3423 readl(iommu->reg + DMAR_FEUADDR_REG);
3424
3425 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3426 }
3427 return 0;
3428
3429 nomem:
3430 for_each_active_iommu(iommu, drhd)
3431 kfree(iommu->iommu_state);
3432
3433 return -ENOMEM;
3434 }
3435
3436 static void iommu_resume(void)
3437 {
3438 struct dmar_drhd_unit *drhd;
3439 struct intel_iommu *iommu = NULL;
3440 unsigned long flag;
3441
3442 if (init_iommu_hw()) {
3443 if (force_on)
3444 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3445 else
3446 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3447 return;
3448 }
3449
3450 for_each_active_iommu(iommu, drhd) {
3451
3452 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3453
3454 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3455 iommu->reg + DMAR_FECTL_REG);
3456 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3457 iommu->reg + DMAR_FEDATA_REG);
3458 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3459 iommu->reg + DMAR_FEADDR_REG);
3460 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3461 iommu->reg + DMAR_FEUADDR_REG);
3462
3463 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3464 }
3465
3466 for_each_active_iommu(iommu, drhd)
3467 kfree(iommu->iommu_state);
3468 }
3469
3470 static struct syscore_ops iommu_syscore_ops = {
3471 .resume = iommu_resume,
3472 .suspend = iommu_suspend,
3473 };
3474
3475 static void __init init_iommu_pm_ops(void)
3476 {
3477 register_syscore_ops(&iommu_syscore_ops);
3478 }
3479
3480 #else
3481 static inline void init_iommu_pm_ops(void) {}
3482 #endif /* CONFIG_SUSPEND */
3483
3484 LIST_HEAD(dmar_rmrr_units);
3485
3486 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3487 {
3488 list_add(&rmrr->list, &dmar_rmrr_units);
3489 }
3490
3492 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3493 {
3494 struct acpi_dmar_reserved_memory *rmrr;
3495 struct dmar_rmrr_unit *rmrru;
3496
3497 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3498 if (!rmrru)
3499 return -ENOMEM;
3500
3501 rmrru->hdr = header;
3502 rmrr = (struct acpi_dmar_reserved_memory *)header;
3503 rmrru->base_address = rmrr->base_address;
3504 rmrru->end_address = rmrr->end_address;
3505
3506 dmar_register_rmrr_unit(rmrru);
3507 return 0;
3508 }
3509
3510 static int __init
3511 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3512 {
3513 struct acpi_dmar_reserved_memory *rmrr;
3514 int ret;
3515
3516 rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3517 ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3518 ((void *)rmrr) + rmrr->header.length,
3519 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3520
3521 if (ret || (rmrru->devices_cnt == 0)) {
3522 list_del(&rmrru->list);
3523 kfree(rmrru);
3524 }
3525 return ret;
3526 }
3527
3528 static LIST_HEAD(dmar_atsr_units);
3529
3530 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3531 {
3532 struct acpi_dmar_atsr *atsr;
3533 struct dmar_atsr_unit *atsru;
3534
3535 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3536 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3537 if (!atsru)
3538 return -ENOMEM;
3539
3540 atsru->hdr = hdr;
3541 atsru->include_all = atsr->flags & 0x1;
3542
3543 list_add(&atsru->list, &dmar_atsr_units);
3544
3545 return 0;
3546 }
3547
3548 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3549 {
3550 int rc;
3551 struct acpi_dmar_atsr *atsr;
3552
3553 if (atsru->include_all)
3554 return 0;
3555
3556 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3557 rc = dmar_parse_dev_scope((void *)(atsr + 1),
3558 (void *)atsr + atsr->header.length,
3559 &atsru->devices_cnt, &atsru->devices,
3560 atsr->segment);
3561 if (rc || !atsru->devices_cnt) {
3562 list_del(&atsru->list);
3563 kfree(atsru);
3564 }
3565
3566 return rc;
3567 }
3568
3569 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3570 {
3571 int i;
3572 struct pci_bus *bus;
3573 struct acpi_dmar_atsr *atsr;
3574 struct dmar_atsr_unit *atsru;
3575
3576 dev = pci_physfn(dev);
3577
3578 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3579 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3580 if (atsr->segment == pci_domain_nr(dev->bus))
3581 goto found;
3582 }
3583
3584 return 0;
3585
3586 found:
3587 for (bus = dev->bus; bus; bus = bus->parent) {
3588 struct pci_dev *bridge = bus->self;
3589
3590 if (!bridge || !pci_is_pcie(bridge) ||
3591 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3592 return 0;
3593
3594 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3595 for (i = 0; i < atsru->devices_cnt; i++)
3596 if (atsru->devices[i] == bridge)
3597 return 1;
3598 break;
3599 }
3600 }
3601
3602 if (atsru->include_all)
3603 return 1;
3604
3605 return 0;
3606 }
3607
3608 int __init dmar_parse_rmrr_atsr_dev(void)
3609 {
3610 struct dmar_rmrr_unit *rmrr, *rmrr_n;
3611 struct dmar_atsr_unit *atsr, *atsr_n;
3612 int ret = 0;
3613
3614 list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3615 ret = rmrr_parse_dev(rmrr);
3616 if (ret)
3617 return ret;
3618 }
3619
3620 list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3621 ret = atsr_parse_dev(atsr);
3622 if (ret)
3623 return ret;
3624 }
3625
3626 return ret;
3627 }
3628
3629 /*
3630 * Here we only respond to the action of a device being unbound from its driver.
3631 *
3632 * A newly added device is not attached to its DMAR domain here yet; that will
3633 * happen when the device is mapped to an iova.
3634 */
3635 static int device_notifier(struct notifier_block *nb,
3636 unsigned long action, void *data)
3637 {
3638 struct device *dev = data;
3639 struct pci_dev *pdev = to_pci_dev(dev);
3640 struct dmar_domain *domain;
3641
3642 if (iommu_no_mapping(dev))
3643 return 0;
3644
3645 domain = find_domain(pdev);
3646 if (!domain)
3647 return 0;
3648
3649 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3650 domain_remove_one_dev_info(domain, pdev);
3651
3652 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3653 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3654 list_empty(&domain->devices))
3655 domain_exit(domain);
3656 }
3657
3658 return 0;
3659 }
3660
3661 static struct notifier_block device_nb = {
3662 .notifier_call = device_notifier,
3663 };
3664
3665 int __init intel_iommu_init(void)
3666 {
3667 int ret = 0;
3668
3669 /* VT-d is required for a TXT/tboot launch, so enforce that */
3670 force_on = tboot_force_iommu();
3671
3672 if (dmar_table_init()) {
3673 if (force_on)
3674 panic("tboot: Failed to initialize DMAR table\n");
3675 return -ENODEV;
3676 }
3677
3678 if (dmar_dev_scope_init() < 0) {
3679 if (force_on)
3680 panic("tboot: Failed to initialize DMAR device scope\n");
3681 return -ENODEV;
3682 }
3683
3684 if (no_iommu || dmar_disabled)
3685 return -ENODEV;
3686
3687 if (iommu_init_mempool()) {
3688 if (force_on)
3689 panic("tboot: Failed to initialize iommu memory\n");
3690 return -ENODEV;
3691 }
3692
3693 if (list_empty(&dmar_rmrr_units))
3694 printk(KERN_INFO "DMAR: No RMRR found\n");
3695
3696 if (list_empty(&dmar_atsr_units))
3697 printk(KERN_INFO "DMAR: No ATSR found\n");
3698
3699 if (dmar_init_reserved_ranges()) {
3700 if (force_on)
3701 panic("tboot: Failed to reserve iommu ranges\n");
3702 return -ENODEV;
3703 }
3704
3705 init_no_remapping_devices();
3706
3707 ret = init_dmars();
3708 if (ret) {
3709 if (force_on)
3710 panic("tboot: Failed to initialize DMARs\n");
3711 printk(KERN_ERR "IOMMU: dmar init failed\n");
3712 put_iova_domain(&reserved_iova_list);
3713 iommu_exit_mempool();
3714 return ret;
3715 }
3716 printk(KERN_INFO
3717 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3718
3719 init_timer(&unmap_timer);
3720 #ifdef CONFIG_SWIOTLB
3721 swiotlb = 0;
3722 #endif
3723 dma_ops = &intel_dma_ops;
3724
3725 init_iommu_pm_ops();
3726
3727 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3728
3729 bus_register_notifier(&pci_bus_type, &device_nb);
3730
3731 intel_iommu_enabled = 1;
3732
3733 return 0;
3734 }
3735
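/*
 * When the device sits behind a PCIe-to-PCI bridge, also tear down the
 * context entries that were set up for the intermediate bridges on the
 * path to it.
 */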
3736 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3737 struct pci_dev *pdev)
3738 {
3739 struct pci_dev *tmp, *parent;
3740
3741 if (!iommu || !pdev)
3742 return;
3743
3744 /* dependent device detach */
3745 tmp = pci_find_upstream_pcie_bridge(pdev);
3746 /* Secondary interface's bus number and devfn 0 */
3747 if (tmp) {
3748 parent = pdev->bus->self;
3749 while (parent != tmp) {
3750 iommu_detach_dev(iommu, parent->bus->number,
3751 parent->devfn);
3752 parent = parent->bus->self;
3753 }
3754 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3755 iommu_detach_dev(iommu,
3756 tmp->subordinate->number, 0);
3757 else /* this is a legacy PCI bridge */
3758 iommu_detach_dev(iommu, tmp->bus->number,
3759 tmp->devfn);
3760 }
3761 }
3762
3763 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3764 struct pci_dev *pdev)
3765 {
3766 struct device_domain_info *info;
3767 struct intel_iommu *iommu;
3768 unsigned long flags;
3769 int found = 0;
3770 struct list_head *entry, *tmp;
3771
3772 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3773 pdev->devfn);
3774 if (!iommu)
3775 return;
3776
3777 spin_lock_irqsave(&device_domain_lock, flags);
3778 list_for_each_safe(entry, tmp, &domain->devices) {
3779 info = list_entry(entry, struct device_domain_info, link);
3780 if (info->segment == pci_domain_nr(pdev->bus) &&
3781 info->bus == pdev->bus->number &&
3782 info->devfn == pdev->devfn) {
3783 unlink_domain_info(info);
3784 spin_unlock_irqrestore(&device_domain_lock, flags);
3785
3786 iommu_disable_dev_iotlb(info);
3787 iommu_detach_dev(iommu, info->bus, info->devfn);
3788 iommu_detach_dependent_devices(iommu, pdev);
3789 free_devinfo_mem(info);
3790
3791 spin_lock_irqsave(&device_domain_lock, flags);
3792
3793 if (found)
3794 break;
3795 else
3796 continue;
3797 }
3798
3799 /* if there are no other devices under the same iommu
3800 * owned by this domain, clear this iommu in iommu_bmp,
3801 * and update the iommu count and coherency
3802 */
3803 if (iommu == device_to_iommu(info->segment, info->bus,
3804 info->devfn))
3805 found = 1;
3806 }
3807
3808 spin_unlock_irqrestore(&device_domain_lock, flags);
3809
3810 if (found == 0) {
3811 unsigned long tmp_flags;
3812 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3813 clear_bit(iommu->seq_id, domain->iommu_bmp);
3814 domain->iommu_count--;
3815 domain_update_iommu_cap(domain);
3816 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3817
3818 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3819 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3820 spin_lock_irqsave(&iommu->lock, tmp_flags);
3821 clear_bit(domain->id, iommu->domain_ids);
3822 iommu->domains[domain->id] = NULL;
3823 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3824 }
3825 }
3826 }
3827
3828 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3829 {
3830 struct device_domain_info *info;
3831 struct intel_iommu *iommu;
3832 unsigned long flags1, flags2;
3833
3834 spin_lock_irqsave(&device_domain_lock, flags1);
3835 while (!list_empty(&domain->devices)) {
3836 info = list_entry(domain->devices.next,
3837 struct device_domain_info, link);
3838 unlink_domain_info(info);
3839 spin_unlock_irqrestore(&device_domain_lock, flags1);
3840
3841 iommu_disable_dev_iotlb(info);
3842 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3843 iommu_detach_dev(iommu, info->bus, info->devfn);
3844 iommu_detach_dependent_devices(iommu, info->dev);
3845
3846 /* clear this iommu in iommu_bmp, update iommu count
3847 * and capabilities
3848 */
3849 spin_lock_irqsave(&domain->iommu_lock, flags2);
3850 if (test_and_clear_bit(iommu->seq_id,
3851 domain->iommu_bmp)) {
3852 domain->iommu_count--;
3853 domain_update_iommu_cap(domain);
3854 }
3855 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3856
3857 free_devinfo_mem(info);
3858 spin_lock_irqsave(&device_domain_lock, flags1);
3859 }
3860 spin_unlock_irqrestore(&device_domain_lock, flags1);
3861 }
3862
3863 /* domain id allocator for virtual machine domains; these ids are not set in context entries */
3864 static unsigned long vm_domid;
3865
3866 static struct dmar_domain *iommu_alloc_vm_domain(void)
3867 {
3868 struct dmar_domain *domain;
3869
3870 domain = alloc_domain_mem();
3871 if (!domain)
3872 return NULL;
3873
3874 domain->id = vm_domid++;
3875 domain->nid = -1;
3876 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3877 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3878
3879 return domain;
3880 }
3881
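/*
 * Initialize a domain that was not created through get_domain_for_dev()
 * (the si domain and IOMMU-API domains): set up its iova space, address
 * widths and top-level page directory.
 */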
3882 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3883 {
3884 int adjust_width;
3885
3886 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3887 spin_lock_init(&domain->iommu_lock);
3888
3889 domain_reserve_special_ranges(domain);
3890
3891 /* calculate AGAW */
3892 domain->gaw = guest_width;
3893 adjust_width = guestwidth_to_adjustwidth(guest_width);
3894 domain->agaw = width_to_agaw(adjust_width);
3895
3896 INIT_LIST_HEAD(&domain->devices);
3897
3898 domain->iommu_count = 0;
3899 domain->iommu_coherency = 0;
3900 domain->iommu_snooping = 0;
3901 domain->iommu_superpage = 0;
3902 domain->max_addr = 0;
3903 domain->nid = -1;
3904
3905 /* always allocate the top pgd */
3906 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3907 if (!domain->pgd)
3908 return -ENOMEM;
3909 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3910 return 0;
3911 }
3912
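/* Release the domain ids this VM domain holds on each hardware iommu. */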
3913 static void iommu_free_vm_domain(struct dmar_domain *domain)
3914 {
3915 unsigned long flags;
3916 struct dmar_drhd_unit *drhd;
3917 struct intel_iommu *iommu;
3918 unsigned long i;
3919 unsigned long ndomains;
3920
3921 for_each_drhd_unit(drhd) {
3922 if (drhd->ignored)
3923 continue;
3924 iommu = drhd->iommu;
3925
3926 ndomains = cap_ndoms(iommu->cap);
3927 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3928 if (iommu->domains[i] == domain) {
3929 spin_lock_irqsave(&iommu->lock, flags);
3930 clear_bit(i, iommu->domain_ids);
3931 iommu->domains[i] = NULL;
3932 spin_unlock_irqrestore(&iommu->lock, flags);
3933 break;
3934 }
3935 }
3936 }
3937 }
3938
3939 static void vm_domain_exit(struct dmar_domain *domain)
3940 {
3941 /* Domain 0 is reserved, so don't process it */
3942 if (!domain)
3943 return;
3944
3945 vm_domain_remove_all_dev_info(domain);
3946 /* destroy iovas */
3947 put_iova_domain(&domain->iovad);
3948
3949 /* clear ptes */
3950 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3951
3952 /* free page tables */
3953 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3954
3955 iommu_free_vm_domain(domain);
3956 free_domain_mem(domain);
3957 }
3958
3959 static int intel_iommu_domain_init(struct iommu_domain *domain)
3960 {
3961 struct dmar_domain *dmar_domain;
3962
3963 dmar_domain = iommu_alloc_vm_domain();
3964 if (!dmar_domain) {
3965 printk(KERN_ERR
3966 "intel_iommu_domain_init: dmar_domain == NULL\n");
3967 return -ENOMEM;
3968 }
3969 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3970 printk(KERN_ERR
3971 "intel_iommu_domain_init() failed\n");
3972 vm_domain_exit(dmar_domain);
3973 return -ENOMEM;
3974 }
3975 domain_update_iommu_cap(dmar_domain);
3976 domain->priv = dmar_domain;
3977
3978 domain->geometry.aperture_start = 0;
3979 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3980 domain->geometry.force_aperture = true;
3981
3982 return 0;
3983 }
3984
3985 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3986 {
3987 struct dmar_domain *dmar_domain = domain->priv;
3988
3989 domain->priv = NULL;
3990 vm_domain_exit(dmar_domain);
3991 }
3992
3993 static int intel_iommu_attach_device(struct iommu_domain *domain,
3994 struct device *dev)
3995 {
3996 struct dmar_domain *dmar_domain = domain->priv;
3997 struct pci_dev *pdev = to_pci_dev(dev);
3998 struct intel_iommu *iommu;
3999 int addr_width;
4000
4001 /* normally pdev is not mapped */
4002 if (unlikely(domain_context_mapped(pdev))) {
4003 struct dmar_domain *old_domain;
4004
4005 old_domain = find_domain(pdev);
4006 if (old_domain) {
4007 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4008 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4009 domain_remove_one_dev_info(old_domain, pdev);
4010 else
4011 domain_remove_dev_info(old_domain);
4012 }
4013 }
4014
4015 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4016 pdev->devfn);
4017 if (!iommu)
4018 return -ENODEV;
4019
4020 /* check if this iommu agaw is sufficient for max mapped address */
4021 addr_width = agaw_to_width(iommu->agaw);
4022 if (addr_width > cap_mgaw(iommu->cap))
4023 addr_width = cap_mgaw(iommu->cap);
4024
4025 if (dmar_domain->max_addr > (1LL << addr_width)) {
4026 printk(KERN_ERR "%s: iommu width (%d) is not "
4027 "sufficient for the mapped address (%llx)\n",
4028 __func__, addr_width, dmar_domain->max_addr);
4029 return -EFAULT;
4030 }
4031 dmar_domain->gaw = addr_width;
4032
4033 /*
4034 * Knock out extra levels of page tables if necessary
4035 */
4036 while (iommu->agaw < dmar_domain->agaw) {
4037 struct dma_pte *pte;
4038
4039 pte = dmar_domain->pgd;
4040 if (dma_pte_present(pte)) {
4041 dmar_domain->pgd = (struct dma_pte *)
4042 phys_to_virt(dma_pte_addr(pte));
4043 free_pgtable_page(pte);
4044 }
4045 dmar_domain->agaw--;
4046 }
4047
4048 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4049 }
4050
4051 static void intel_iommu_detach_device(struct iommu_domain *domain,
4052 struct device *dev)
4053 {
4054 struct dmar_domain *dmar_domain = domain->priv;
4055 struct pci_dev *pdev = to_pci_dev(dev);
4056
4057 domain_remove_one_dev_info(dmar_domain, pdev);
4058 }
4059
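/*
 * iommu_ops->map callback: translate IOMMU_READ/WRITE/CACHE into DMA PTE
 * bits, check that the mapping fits within the domain's address width,
 * and install the page-table entries for the range.
 */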
4060 static int intel_iommu_map(struct iommu_domain *domain,
4061 unsigned long iova, phys_addr_t hpa,
4062 size_t size, int iommu_prot)
4063 {
4064 struct dmar_domain *dmar_domain = domain->priv;
4065 u64 max_addr;
4066 int prot = 0;
4067 int ret;
4068
4069 if (iommu_prot & IOMMU_READ)
4070 prot |= DMA_PTE_READ;
4071 if (iommu_prot & IOMMU_WRITE)
4072 prot |= DMA_PTE_WRITE;
4073 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4074 prot |= DMA_PTE_SNP;
4075
4076 max_addr = iova + size;
4077 if (dmar_domain->max_addr < max_addr) {
4078 u64 end;
4079
4080 /* check if minimum agaw is sufficient for mapped address */
4081 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4082 if (end < max_addr) {
4083 printk(KERN_ERR "%s: iommu width (%d) is not "
4084 "sufficient for the mapped address (%llx)\n",
4085 __func__, dmar_domain->gaw, max_addr);
4086 return -EFAULT;
4087 }
4088 dmar_domain->max_addr = max_addr;
4089 }
4090 /* Round up size to next multiple of PAGE_SIZE, if it and
4091 the low bits of hpa would take us onto the next page */
4092 size = aligned_nrpages(hpa, size);
4093 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4094 hpa >> VTD_PAGE_SHIFT, size, prot);
4095 return ret;
4096 }
4097
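/*
 * iommu_ops->unmap callback: clear the PTEs covering the range and return
 * the size that was actually unmapped.
 */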
4098 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4099 unsigned long iova, size_t size)
4100 {
4101 struct dmar_domain *dmar_domain = domain->priv;
4102 int order;
4103
4104 order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4105 (iova + size - 1) >> VTD_PAGE_SHIFT);
4106
4107 if (dmar_domain->max_addr == iova + size)
4108 dmar_domain->max_addr = iova;
4109
4110 return PAGE_SIZE << order;
4111 }
4112
4113 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4114 unsigned long iova)
4115 {
4116 struct dmar_domain *dmar_domain = domain->priv;
4117 struct dma_pte *pte;
4118 u64 phys = 0;
4119
4120 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4121 if (pte)
4122 phys = dma_pte_addr(pte);
4123
4124 return phys;
4125 }
4126
4127 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4128 unsigned long cap)
4129 {
4130 struct dmar_domain *dmar_domain = domain->priv;
4131
4132 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4133 return dmar_domain->iommu_snooping;
4134 if (cap == IOMMU_CAP_INTR_REMAP)
4135 return irq_remapping_enabled;
4136
4137 return 0;
4138 }
4139
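/*
 * Replace *from with 'to', dropping the reference held on the old device.
 * Callers pass in a 'to' that already holds its own reference.
 */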
4140 static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4141 {
4142 pci_dev_put(*from);
4143 *from = to;
4144 }
4145
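/*
 * ACS capabilities we require before treating PCIe functions as isolated
 * from each other: Source Validation, Request Redirect, Completion
 * Redirect and Upstream Forwarding.
 */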
4146 #define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4147
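/*
 * iommu_ops->add_device callback: walk upstream from the device, past
 * legacy bridges, DMA-source quirks and anything lacking the required ACS
 * protection, to find the device at which DMA is actually isolated, and
 * put 'dev' into that device's iommu_group.
 */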
4148 static int intel_iommu_add_device(struct device *dev)
4149 {
4150 struct pci_dev *pdev = to_pci_dev(dev);
4151 struct pci_dev *bridge, *dma_pdev = NULL;
4152 struct iommu_group *group;
4153 int ret;
4154
4155 if (!device_to_iommu(pci_domain_nr(pdev->bus),
4156 pdev->bus->number, pdev->devfn))
4157 return -ENODEV;
4158
4159 bridge = pci_find_upstream_pcie_bridge(pdev);
4160 if (bridge) {
4161 if (pci_is_pcie(bridge))
4162 dma_pdev = pci_get_domain_bus_and_slot(
4163 pci_domain_nr(pdev->bus),
4164 bridge->subordinate->number, 0);
4165 if (!dma_pdev)
4166 dma_pdev = pci_dev_get(bridge);
4167 } else
4168 dma_pdev = pci_dev_get(pdev);
4169
4170 /* Account for quirked devices */
4171 swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4172
4173 /*
4174 * If it's a multifunction device that does not support our
4175 * required ACS flags, add to the same group as function 0.
4176 */
4177 if (dma_pdev->multifunction &&
4178 !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4179 swap_pci_ref(&dma_pdev,
4180 pci_get_slot(dma_pdev->bus,
4181 PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4182 0)));
4183
4184 /*
4185 * Devices on the root bus go through the iommu. If that's not us,
4186 * find the next upstream device and test ACS up to the root bus.
4187 * Finding the next device may require skipping virtual buses.
4188 */
4189 while (!pci_is_root_bus(dma_pdev->bus)) {
4190 struct pci_bus *bus = dma_pdev->bus;
4191
4192 while (!bus->self) {
4193 if (!pci_is_root_bus(bus))
4194 bus = bus->parent;
4195 else
4196 goto root_bus;
4197 }
4198
4199 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4200 break;
4201
4202 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4203 }
4204
4205 root_bus:
4206 group = iommu_group_get(&dma_pdev->dev);
4207 pci_dev_put(dma_pdev);
4208 if (!group) {
4209 group = iommu_group_alloc();
4210 if (IS_ERR(group))
4211 return PTR_ERR(group);
4212 }
4213
4214 ret = iommu_group_add_device(group, dev);
4215
4216 iommu_group_put(group);
4217 return ret;
4218 }
4219
4220 static void intel_iommu_remove_device(struct device *dev)
4221 {
4222 iommu_group_remove_device(dev);
4223 }
4224
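/* Intel VT-d implementation of the generic IOMMU API. */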
4225 static struct iommu_ops intel_iommu_ops = {
4226 .domain_init = intel_iommu_domain_init,
4227 .domain_destroy = intel_iommu_domain_destroy,
4228 .attach_dev = intel_iommu_attach_device,
4229 .detach_dev = intel_iommu_detach_device,
4230 .map = intel_iommu_map,
4231 .unmap = intel_iommu_unmap,
4232 .iova_to_phys = intel_iommu_iova_to_phys,
4233 .domain_has_cap = intel_iommu_domain_has_cap,
4234 .add_device = intel_iommu_add_device,
4235 .remove_device = intel_iommu_remove_device,
4236 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4237 };
4238
4239 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4240 {
4241 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4242 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4243 dmar_map_gfx = 0;
4244 }
4245
4246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4253
4254 static void quirk_iommu_rwbf(struct pci_dev *dev)
4255 {
4256 /*
4257 * Mobile 4 Series Chipset neglects to set RWBF capability,
4258 * but needs it. Same seems to hold for the desktop versions.
4259 */
4260 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4261 rwbf_quirk = 1;
4262 }
4263
4264 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4265 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4271
4272 #define GGC 0x52
4273 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4274 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4275 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4276 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4277 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4278 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4279 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4280 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4281
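/*
 * If the BIOS left no GTT space for VT-d (no "shadow GTT") in the GGC
 * register, graphics DMA remapping cannot work, so disable it for the
 * graphics device; otherwise, if graphics remapping is still enabled,
 * fall back to strict (unbatched) IOTLB flushing.
 */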
4282 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4283 {
4284 unsigned short ggc;
4285
4286 if (pci_read_config_word(dev, GGC, &ggc))
4287 return;
4288
4289 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4290 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4291 dmar_map_gfx = 0;
4292 } else if (dmar_map_gfx) {
4293 /* we have to ensure the gfx device is idle before we flush */
4294 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4295 intel_iommu_strict = 1;
4296 }
4297 }
4298 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4299 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4300 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4301 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4302
4303 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4304 ISOCH DMAR unit for the Azalia sound device, but not give it any
4305 TLB entries, which causes it to deadlock. Check for that. We do
4306 this in a function called from init_dmars(), instead of in a PCI
4307 quirk, because we don't want to print the obnoxious "BIOS broken"
4308 message if VT-d is actually disabled.
4309 */
4310 static void __init check_tylersburg_isoch(void)
4311 {
4312 struct pci_dev *pdev;
4313 uint32_t vtisochctrl;
4314
4315 /* If there's no Azalia in the system anyway, forget it. */
4316 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4317 if (!pdev)
4318 return;
4319 pci_dev_put(pdev);
4320
4321 /* System Management Registers. Might be hidden, in which case
4322 we can't do the sanity check. But that's OK, because the
4323 known-broken BIOSes _don't_ actually hide it, so far. */
4324 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4325 if (!pdev)
4326 return;
4327
4328 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4329 pci_dev_put(pdev);
4330 return;
4331 }
4332
4333 pci_dev_put(pdev);
4334
4335 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4336 if (vtisochctrl & 1)
4337 return;
4338
4339 /* Drop all bits other than the number of TLB entries */
4340 vtisochctrl &= 0x1c;
4341
4342 /* If we have the recommended number of TLB entries (16), fine. */
4343 if (vtisochctrl == 0x10)
4344 return;
4345
4346 /* Zero TLB entries? You get to ride the short bus to school. */
4347 if (!vtisochctrl) {
4348 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4349 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4350 dmi_get_system_info(DMI_BIOS_VENDOR),
4351 dmi_get_system_info(DMI_BIOS_VERSION),
4352 dmi_get_system_info(DMI_PRODUCT_VERSION));
4353 iommu_identity_mapping |= IDENTMAP_AZALIA;
4354 return;
4355 }
4356
4357 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4358 vtisochctrl);
4359 }