@@ -32,6 +32,38 @@ static bool engine_supports_migration(struct intel_engine_cs *engine)
 	return true;
 }
 
+static void xehpsdv_toggle_pdes(struct i915_address_space *vm,
+				struct i915_page_table *pt,
+				void *data)
+{
+	struct insert_pte_data *d = data;
+
+	/*
+	 * Insert a dummy PTE into every PT that will map to LMEM to ensure
+	 * we have a correctly setup PDE structure for later use.
+	 */
+	vm->insert_page(vm, 0, d->offset, I915_CACHE_NONE, PTE_LM);
+	GEM_BUG_ON(!pt->is_compact);
+	d->offset += SZ_2M;
+}
+
+static void xehpsdv_insert_pte(struct i915_address_space *vm,
+			       struct i915_page_table *pt,
+			       void *data)
+{
+	struct insert_pte_data *d = data;
+
+	/*
+	 * We are playing tricks here, since the actual pt, from the hw
+	 * pov, is only 256 bytes with 32 entries, or 4096 bytes with 512
+	 * entries, but we are still guaranteed that the physical
+	 * alignment is 64K underneath for the pt, and we are careful
+	 * not to access the space in the void.
+	 */
+	vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE, PTE_LM);
+	d->offset += SZ_64K;
+}
+
 static void insert_pte(struct i915_address_space *vm,
 		       struct i915_page_table *pt,
 		       void *data)
@@ -74,7 +106,32 @@ static struct i915_address_space *migrate_vm(struct intel_gt *gt)
 	 * i.e. within the same non-preemptible window so that we do not switch
 	 * to another migration context that overwrites the PTE.
 	 *
-	 * TODO: Add support for huge LMEM PTEs
+	 * This changes quite a bit on platforms with HAS_64K_PAGES support,
+	 * where we instead have three windows, each CHUNK_SZ in size. The
+	 * first is reserved for mapping system-memory, and that just uses the
+	 * 512 entry layout using 4K GTT pages. The other two windows just map
+	 * lmem pages and must use the new compact 32 entry layout using 64K
+	 * GTT pages, which ensures we can address any lmem object that the
+	 * user throws at us. We then also use xehpsdv_toggle_pdes as a way of
+	 * just toggling the PDE bit (GEN12_PDE_64K) for us, to enable the
+	 * compact layout for each of these page-tables that fall within the
+	 * [CHUNK_SZ, 3 * CHUNK_SZ) range.
+	 *
+	 * We lay the ppGTT out as:
+	 *
+	 * [0, CHUNK_SZ) -> first window/object, maps smem
+	 * [CHUNK_SZ, 2 * CHUNK_SZ) -> second window/object, maps lmem src
+	 * [2 * CHUNK_SZ, 3 * CHUNK_SZ) -> third window/object, maps lmem dst
+	 *
+	 * For the PTE window it's also quite different, since each PTE must
+	 * point to some 64K page, one for each PT (since it's in lmem), and
+	 * yet each is only <= 4096 bytes, but since the unused space within
+	 * that PTE range is never touched, this should be fine.
+	 *
+	 * So basically each PT now needs 64K of virtual memory, instead of
+	 * 4K, which looks like:
+	 *
+	 * [3 * CHUNK_SZ, 3 * CHUNK_SZ + ((3 * CHUNK_SZ / SZ_2M) * SZ_64K)] -> PTE
+	 */
 
 	vm = i915_ppgtt_create(gt, I915_BO_ALLOC_PM_EARLY);
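
The layout comment above is easiest to check with concrete numbers. Below is a
minimal userspace sketch of the arithmetic, assuming CHUNK_SZ is SZ_8M (per
the "We copy in 8MiB chunks" comment further down); it is illustration only,
not part of the patch:

#include <assert.h>

#define SZ_64K		(64ull << 10)
#define SZ_2M		(2ull << 20)
#define SZ_8M		(8ull << 20)
#define CHUNK_SZ	SZ_8M	/* assumed */

int main(void)
{
	unsigned long long windows = 3 * CHUNK_SZ;	/* smem + lmem src + lmem dst */
	unsigned long long pts = windows / SZ_2M;	/* one PT per 2M PDE */
	unsigned long long pte_win = pts * SZ_64K;	/* one 64K slot per PT */

	assert(pts == 12);
	assert(pte_win == 768ull << 10);	/* PTE window: [24M, 24M + 768K) */
	return 0;
}
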
@@ -86,6 +143,9 @@ static struct i915_address_space *migrate_vm(struct intel_gt *gt)
 		goto err_vm;
 	}
 
+	if (HAS_64K_PAGES(gt->i915))
+		stash.pt_sz = I915_GTT_PAGE_SIZE_64K;
+
 	/*
 	 * Each engine instance is assigned its own chunk in the VM, so
 	 * that we can run multiple instances concurrently
@@ -105,14 +165,20 @@ static struct i915_address_space *migrate_vm(struct intel_gt *gt)
 		 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
 		 * 4x2 page directories for source/destination.
 		 */
-		sz = 2 * CHUNK_SZ;
+		if (HAS_64K_PAGES(gt->i915))
+			sz = 3 * CHUNK_SZ;
+		else
+			sz = 2 * CHUNK_SZ;
 		d.offset = base + sz;
 
 		/*
 		 * We need another page directory setup so that we can write
 		 * the 8x512 PTE in each chunk.
 		 */
-		sz += (sz >> 12) * sizeof(u64);
+		if (HAS_64K_PAGES(gt->i915))
+			sz += (sz / SZ_2M) * SZ_64K;
+		else
+			sz += (sz >> 12) * sizeof(u64);
 
 		err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
 		if (err)
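
Plugging CHUNK_SZ = SZ_8M into the two sizing branches makes the difference
concrete (again a sketch under that assumption, not part of the patch):

#include <assert.h>
#include <stdint.h>

#define SZ_64K		(64ull << 10)
#define SZ_2M		(2ull << 20)
#define CHUNK_SZ	(8ull << 20)	/* assumed */

int main(void)
{
	/* Legacy: two 8M windows, plus one 8-byte PTE per 4K page. */
	uint64_t legacy = 2 * CHUNK_SZ;
	legacy += (legacy >> 12) * sizeof(uint64_t);
	assert(legacy == (16ull << 20) + (32ull << 10));	/* 16M + 32K */

	/* 64K pages: three 8M windows, plus one 64K PTE slot per 2M PT. */
	uint64_t compact = 3 * CHUNK_SZ;
	compact += (compact / SZ_2M) * SZ_64K;
	assert(compact == (24ull << 20) + (768ull << 10));	/* 24M + 768K */
	return 0;
}
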
@@ -133,7 +199,18 @@ static struct i915_address_space *migrate_vm(struct intel_gt *gt)
 			goto err_vm;
 
 		/* Now allow the GPU to rewrite the PTE via its own ppGTT */
-		vm->vm.foreach(&vm->vm, base, d.offset - base, insert_pte, &d);
+		if (HAS_64K_PAGES(gt->i915)) {
+			vm->vm.foreach(&vm->vm, base, d.offset - base,
+				       xehpsdv_insert_pte, &d);
+			d.offset = base + CHUNK_SZ;
+			vm->vm.foreach(&vm->vm,
+				       d.offset,
+				       2 * CHUNK_SZ,
+				       xehpsdv_toggle_pdes, &d);
+		} else {
+			vm->vm.foreach(&vm->vm, base, d.offset - base,
+				       insert_pte, &d);
+		}
 	}
 
 	return &vm->vm;
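
The two foreach passes above can be visualised with a userspace sketch that
mirrors the traversal (CHUNK_SZ again assumed to be 8M; slot addresses are
relative to the engine's base):

#include <stdio.h>

int main(void)
{
	const unsigned long long SZ_2M = 2ull << 20, SZ_64K = 64ull << 10;
	const unsigned long long CHUNK_SZ = 8ull << 20;	/* assumed */
	unsigned long long slot = 3 * CHUNK_SZ;		/* start of PTE window */
	unsigned long long va;

	/* Pass 1: xehpsdv_insert_pte maps one 64K slot per PT in [0, 3 * CHUNK_SZ). */
	for (va = 0; va < 3 * CHUNK_SZ; va += SZ_2M, slot += SZ_64K)
		printf("PT for [%lluM, %lluM) -> slot at %lluK\n",
		       va >> 20, (va + SZ_2M) >> 20, slot >> 10);

	/* Pass 2: xehpsdv_toggle_pdes touches one dummy PTE per 2M in
	 * [CHUNK_SZ, 3 * CHUNK_SZ), flipping GEN12_PDE_64K on the PDEs of
	 * the two lmem windows; the smem window stays in 4K mode. */
	return 0;
}
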
@@ -269,27 +346,46 @@ static int emit_pte(struct i915_request *rq,
 		    u64 offset,
 		    int length)
 {
+	bool has_64K_pages = HAS_64K_PAGES(rq->engine->i915);
 	const u64 encode = rq->context->vm->pte_encode(0, cache_level,
 						       is_lmem ? PTE_LM : 0);
 	struct intel_ring *ring = rq->ring;
-	int total = 0;
+	int pkt, dword_length;
+	u32 total = 0;
+	u32 page_size;
 	u32 *hdr, *cs;
-	int pkt;
 
 	GEM_BUG_ON(GRAPHICS_VER(rq->engine->i915) < 8);
 
+	page_size = I915_GTT_PAGE_SIZE;
+	dword_length = 0x400;
+
 	/* Compute the page directory offset for the target address range */
-	offset >>= 12;
-	offset *= sizeof(u64);
-	offset += 2 * CHUNK_SZ;
+	if (has_64K_pages) {
+		GEM_BUG_ON(!IS_ALIGNED(offset, SZ_2M));
+
+		offset /= SZ_2M;
+		offset *= SZ_64K;
+		offset += 3 * CHUNK_SZ;
+
+		if (is_lmem) {
+			page_size = I915_GTT_PAGE_SIZE_64K;
+			dword_length = 0x40;
+		}
+	} else {
+		offset >>= 12;
+		offset *= sizeof(u64);
+		offset += 2 * CHUNK_SZ;
+	}
+
 	offset += (u64)rq->engine->instance << 32;
 
 	cs = intel_ring_begin(rq, 6);
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
 	/* Pack as many PTE updates as possible into a single MI command */
-	pkt = min_t(int, 0x400, ring->space / sizeof(u32) + 5);
+	pkt = min_t(int, dword_length, ring->space / sizeof(u32) + 5);
 	pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);
 
 	hdr = cs;
@@ -299,6 +395,8 @@ static int emit_pte(struct i915_request *rq,
 
 	do {
 		if (cs - hdr >= pkt) {
+			int dword_rem;
+
 			*hdr += cs - hdr - 2;
 			*cs++ = MI_NOOP;
 
@@ -310,7 +408,18 @@ static int emit_pte(struct i915_request *rq,
 			if (IS_ERR(cs))
 				return PTR_ERR(cs);
 
-			pkt = min_t(int, 0x400, ring->space / sizeof(u32) + 5);
+			dword_rem = dword_length;
+			if (has_64K_pages) {
+				if (IS_ALIGNED(total, SZ_2M)) {
+					offset = round_up(offset, SZ_64K);
+				} else {
+					dword_rem = SZ_2M - (total & (SZ_2M - 1));
+					dword_rem /= page_size;
+					dword_rem *= 2;
+				}
+			}
+
+			pkt = min_t(int, dword_rem, ring->space / sizeof(u32) + 5);
 			pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);
 
 			hdr = cs;
@@ -319,13 +428,15 @@ static int emit_pte(struct i915_request *rq,
 			*cs++ = upper_32_bits(offset);
 		}
 
+		GEM_BUG_ON(!IS_ALIGNED(it->dma, page_size));
+
 		*cs++ = lower_32_bits(encode | it->dma);
 		*cs++ = upper_32_bits(encode | it->dma);
 
 		offset += 8;
-		total += I915_GTT_PAGE_SIZE;
+		total += page_size;
 
-		it->dma += I915_GTT_PAGE_SIZE;
+		it->dma += page_size;
 		if (it->dma >= it->max) {
 			it->sg = __sg_next(it->sg);
 			if (!it->sg || sg_dma_len(it->sg) == 0)
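
The compact PD-offset math and the dword_rem clamp in emit_pte() are worth a
worked example. A sketch, assuming CHUNK_SZ = 8M and a 64K lmem page size;
not part of the patch:

#include <assert.h>
#include <stdint.h>

#define SZ_64K		(64ull << 10)
#define SZ_2M		(2ull << 20)
#define CHUNK_SZ	(8ull << 20)	/* assumed */

int main(void)
{
	/* PD offset for the lmem dst window (VA = 2 * CHUNK_SZ). */
	uint64_t offset = 2 * CHUNK_SZ;
	offset /= SZ_2M;		/* PT index 8 */
	offset *= SZ_64K;		/* 512K into the PTE window */
	offset += 3 * CHUNK_SZ;		/* absolute: 24M + 512K */
	assert(offset == (24ull << 20) + (512ull << 10));

	/* dword_rem 1M into a chunk: one MI packet must not cross into
	 * the next PT's 64K slot, so clamp at the 2M boundary. */
	uint64_t total = 1ull << 20;
	uint64_t dword_rem = SZ_2M - (total & (SZ_2M - 1));	/* 1M of VA left */
	dword_rem /= SZ_64K;		/* 16 PTEs */
	dword_rem *= 2;			/* two dwords per PTE: 32 */
	assert(dword_rem == 32);
	return 0;
}
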
@@ -356,7 +467,8 @@ static bool wa_1209644611_applies(int ver, u32 size)
 	return height % 4 == 3 && height <= 8;
 }
 
-static int emit_copy(struct i915_request *rq, int size)
+static int emit_copy(struct i915_request *rq,
+		     u32 dst_offset, u32 src_offset, int size)
 {
 	const int ver = GRAPHICS_VER(rq->engine->i915);
 	u32 instance = rq->engine->instance;
@@ -371,31 +483,31 @@ static int emit_copy(struct i915_request *rq, int size)
 		*cs++ = BLT_DEPTH_32 | PAGE_SIZE;
 		*cs++ = 0;
 		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-		*cs++ = CHUNK_SZ; /* dst offset */
+		*cs++ = dst_offset;
 		*cs++ = instance;
 		*cs++ = 0;
 		*cs++ = PAGE_SIZE;
-		*cs++ = 0; /* src offset */
+		*cs++ = src_offset;
 		*cs++ = instance;
 	} else if (ver >= 8) {
 		*cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
 		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
 		*cs++ = 0;
 		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-		*cs++ = CHUNK_SZ; /* dst offset */
+		*cs++ = dst_offset;
 		*cs++ = instance;
 		*cs++ = 0;
 		*cs++ = PAGE_SIZE;
-		*cs++ = 0; /* src offset */
+		*cs++ = src_offset;
 		*cs++ = instance;
 	} else {
 		GEM_BUG_ON(instance);
 		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
 		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
 		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
-		*cs++ = CHUNK_SZ; /* dst offset */
+		*cs++ = dst_offset;
 		*cs++ = PAGE_SIZE;
-		*cs++ = 0; /* src offset */
+		*cs++ = src_offset;
 	}
 
 	intel_ring_advance(rq, cs);
@@ -423,6 +535,7 @@ intel_context_migrate_copy(struct intel_context *ce,
 	GEM_BUG_ON(ce->ring->size < SZ_64K);
 
 	do {
+		u32 src_offset, dst_offset;
 		int len;
 
 		rq = i915_request_create(ce);
@@ -450,15 +563,28 @@ intel_context_migrate_copy(struct intel_context *ce,
 		if (err)
 			goto out_rq;
 
-		len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem, 0,
-			       CHUNK_SZ);
+		src_offset = 0;
+		dst_offset = CHUNK_SZ;
+		if (HAS_64K_PAGES(ce->engine->i915)) {
+			GEM_BUG_ON(!src_is_lmem && !dst_is_lmem);
+
+			src_offset = 0;
+			dst_offset = 0;
+			if (src_is_lmem)
+				src_offset = CHUNK_SZ;
+			if (dst_is_lmem)
+				dst_offset = 2 * CHUNK_SZ;
+		}
+
+		len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
+			       src_offset, CHUNK_SZ);
 		if (len <= 0) {
 			err = len;
 			goto out_rq;
 		}
 
 		err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
-			       CHUNK_SZ, len);
+			       dst_offset, len);
 		if (err < 0)
 			goto out_rq;
 		if (err < len) {
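
Spelled out, the HAS_64K_PAGES selection above assigns windows per copy
direction as in the table below; the helper is a sketch mirroring the diff,
not kernel code:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define CHUNK_SZ	(8ull << 20)	/* assumed */

/*
 * smem -> lmem : src = 0,        dst = 2 * CHUNK_SZ
 * lmem -> smem : src = CHUNK_SZ, dst = 0
 * lmem -> lmem : src = CHUNK_SZ, dst = 2 * CHUNK_SZ
 *
 * smem -> smem is rejected by the GEM_BUG_ON(): window 0 is the only
 * 4K window, and it cannot hold both mappings.
 */
static void select_windows(bool src_is_lmem, bool dst_is_lmem,
			   uint64_t *src, uint64_t *dst)
{
	*src = src_is_lmem ? CHUNK_SZ : 0;
	*dst = dst_is_lmem ? 2 * CHUNK_SZ : 0;
}

int main(void)
{
	uint64_t src, dst;

	select_windows(true, false, &src, &dst);	/* lmem -> smem */
	assert(src == CHUNK_SZ && dst == 0);
	return 0;
}
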
@@ -470,7 +596,7 @@ intel_context_migrate_copy(struct intel_context *ce,
 		if (err)
 			goto out_rq;
 
-		err = emit_copy(rq, len);
+		err = emit_copy(rq, dst_offset, src_offset, len);
 
 		/* Arbitration is re-enabled between requests. */
 out_rq:
@@ -488,14 +614,15 @@ intel_context_migrate_copy(struct intel_context *ce,
 	return err;
 }
 
-static int emit_clear(struct i915_request *rq, int size, u32 value)
+static int emit_clear(struct i915_request *rq, u64 offset, int size, u32 value)
 {
 	const int ver = GRAPHICS_VER(rq->engine->i915);
-	u32 instance = rq->engine->instance;
 	u32 *cs;
 
 	GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
 
+	offset += (u64)rq->engine->instance << 32;
+
 	cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
@@ -505,17 +632,17 @@ static int emit_clear(struct i915_request *rq, int size, u32 value)
 		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
 		*cs++ = 0;
 		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-		*cs++ = 0; /* offset */
-		*cs++ = instance;
+		*cs++ = lower_32_bits(offset);
+		*cs++ = upper_32_bits(offset);
 		*cs++ = value;
 		*cs++ = MI_NOOP;
 	} else {
-		GEM_BUG_ON(instance);
+		GEM_BUG_ON(upper_32_bits(offset));
 		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
 		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
 		*cs++ = 0;
 		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-		*cs++ = 0;
+		*cs++ = lower_32_bits(offset);
 		*cs++ = value;
 	}
 
@@ -542,6 +669,7 @@ intel_context_migrate_clear(struct intel_context *ce,
 	GEM_BUG_ON(ce->ring->size < SZ_64K);
 
 	do {
+		u32 offset;
 		int len;
 
 		rq = i915_request_create(ce);
@@ -569,7 +697,11 @@ intel_context_migrate_clear(struct intel_context *ce,
 		if (err)
 			goto out_rq;
 
-		len = emit_pte(rq, &it, cache_level, is_lmem, 0, CHUNK_SZ);
+		offset = 0;
+		if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
+			offset = CHUNK_SZ;
+
+		len = emit_pte(rq, &it, cache_level, is_lmem, offset, CHUNK_SZ);
 		if (len <= 0) {
 			err = len;
 			goto out_rq;
@@ -579,7 +711,7 @@ intel_context_migrate_clear(struct intel_context *ce,
 		if (err)
 			goto out_rq;
 
-		err = emit_clear(rq, len, value);
+		err = emit_clear(rq, offset, len, value);
 
 		/* Arbitration is re-enabled between requests. */
 out_rq: