4 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 #include <linux/delay.h> /* for udelay */
21 #include <linux/mm.h> /* for totalram_pages */
22 #include <linux/scatterlist.h>
23 #include <linux/tegra-soc.h>
24 #include <linux/nvhost_dbg_gpu_ioctl.h>
25 #include <linux/vmalloc.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/firmware.h>
28 #include <linux/nvhost.h>
31 #include "kind_gk20a.h"
32 #include "gr_ctx_gk20a.h"
34 #include "hw_ccsr_gk20a.h"
35 #include "hw_ctxsw_prog_gk20a.h"
36 #include "hw_fifo_gk20a.h"
37 #include "hw_gr_gk20a.h"
38 #include "hw_gmmu_gk20a.h"
39 #include "hw_mc_gk20a.h"
40 #include "hw_ram_gk20a.h"
41 #include "hw_pri_ringmaster_gk20a.h"
42 #include "hw_pri_ringstation_sys_gk20a.h"
43 #include "hw_pri_ringstation_gpc_gk20a.h"
44 #include "hw_pri_ringstation_fbp_gk20a.h"
45 #include "hw_proj_gk20a.h"
46 #include "hw_top_gk20a.h"
47 #include "hw_ltc_gk20a.h"
48 #include "hw_fb_gk20a.h"
49 #include "hw_therm_gk20a.h"
50 #include "hw_pbdma_gk20a.h"
51 #include "gr_pri_gk20a.h"
52 #include "regops_gk20a.h"
53 #include "dbg_gpu_gk20a.h"
55 #define BLK_SIZE (256)
57 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
59 /* global ctx buffer */
60 static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
61 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
62 static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
63 struct channel_gk20a *c);
64 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
66 /* channel gr ctx buffer */
67 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
68 struct channel_gk20a *c);
69 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
71 /* channel patch ctx buffer */
72 static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
73 struct channel_gk20a *c);
74 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
76 /* golden ctx image */
77 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
78 struct channel_gk20a *c);
79 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
80 struct channel_gk20a *c);
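/*
 * Dump FECS falcon state to the kernel log: ctxsw mailboxes, IRQ registers
 * and a few internal falcon registers read back through the ICD debug
 * interface. Called when a wait on the ctxsw ucode times out or fails.
 */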
82 void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
86 gk20a_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
87 gk20a_readl(g, gr_fecs_os_r()));
88 gk20a_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
89 gk20a_readl(g, gr_fecs_cpuctl_r()));
90 gk20a_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
91 gk20a_readl(g, gr_fecs_idlestate_r()));
92 gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
93 gk20a_readl(g, gr_fecs_mailbox0_r()));
94 gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
95 gk20a_readl(g, gr_fecs_mailbox1_r()));
96 gk20a_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
97 gk20a_readl(g, gr_fecs_irqstat_r()));
98 gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
99 gk20a_readl(g, gr_fecs_irqmode_r()));
100 gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
101 gk20a_readl(g, gr_fecs_irqmask_r()));
102 gk20a_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
103 gk20a_readl(g, gr_fecs_irqdest_r()));
104 gk20a_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
105 gk20a_readl(g, gr_fecs_debug1_r()));
106 gk20a_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
107 gk20a_readl(g, gr_fecs_debuginfo_r()));
109 for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
110 gk20a_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
111 i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
113 gk20a_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
114 gk20a_readl(g, gr_fecs_engctl_r()));
115 gk20a_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
116 gk20a_readl(g, gr_fecs_curctx_r()));
117 gk20a_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
118 gk20a_readl(g, gr_fecs_nxtctx_r()));
120 gk20a_writel(g, gr_fecs_icd_cmd_r(),
121 gr_fecs_icd_cmd_opc_rreg_f() |
122 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
123 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
124 gk20a_readl(g, gr_fecs_icd_rdata_r()));
126 gk20a_writel(g, gr_fecs_icd_cmd_r(),
127 gr_fecs_icd_cmd_opc_rreg_f() |
128 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
129 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
130 gk20a_readl(g, gr_fecs_icd_rdata_r()));
132 gk20a_writel(g, gr_fecs_icd_cmd_r(),
133 gr_fecs_icd_cmd_opc_rreg_f() |
134 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
135 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
136 gk20a_readl(g, gr_fecs_icd_rdata_r()));
138 gk20a_writel(g, gr_fecs_icd_cmd_r(),
139 gr_fecs_icd_cmd_opc_rreg_f() |
140 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
141 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
142 gk20a_readl(g, gr_fecs_icd_rdata_r()));
144 gk20a_writel(g, gr_fecs_icd_cmd_r(),
145 gr_fecs_icd_cmd_opc_rreg_f() |
146 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
147 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
148 gk20a_readl(g, gr_fecs_icd_rdata_r()));
150 for (i = 0; i < 4; i++) {
151 gk20a_writel(g, gr_fecs_icd_cmd_r(),
152 gr_fecs_icd_cmd_opc_rreg_f() |
153 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
154 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
155 gk20a_readl(g, gr_fecs_icd_rdata_r()));
157 gk20a_writel(g, gr_fecs_icd_cmd_r(),
158 gr_fecs_icd_cmd_opc_rreg_f() |
159 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
160 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
161 gk20a_readl(g, gr_fecs_icd_rdata_r()));
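/*
 * Stream the GPCCS and FECS data segments (g->gr.ctx_vars.ucode.*.data)
 * into falcon DMEM. DMEM port 0 is configured for auto-incrementing
 * writes (aincw), so the u32 words can be written back-to-back.
 */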
165 static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
167 u32 i, ucode_u32_size;
168 const u32 *ucode_u32_data;
173 gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
174 gr_gpccs_dmemc_blk_f(0) |
175 gr_gpccs_dmemc_aincw_f(1)));
177 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
178 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
180 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
181 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
182 checksum += ucode_u32_data[i];
185 gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
186 gr_fecs_dmemc_blk_f(0) |
187 gr_fecs_dmemc_aincw_f(1)));
189 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
190 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
192 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
193 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
194 checksum += ucode_u32_data[i];
196 gk20a_dbg_fn("done");
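/*
 * Load the GPCCS and FECS instruction segments into falcon IMEM. A new
 * IMEM tag is written at every 256-byte block boundary, and the final
 * partial block is zero-padded so it is covered by a valid tag.
 */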
199 static void gr_gk20a_load_falcon_imem(struct gk20a *g)
201 u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
202 const u32 *ucode_u32_data;
203 u32 tag, i, pad_start, pad_end;
208 cfg = gk20a_readl(g, gr_fecs_cfg_r());
209 fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
211 cfg = gk20a_readl(g, gr_gpc0_cfg_r());
212 gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
214 /* Use the broadcast address to access all of the GPCCS units. */
215 gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
216 gr_gpccs_imemc_blk_f(0) |
217 gr_gpccs_imemc_aincw_f(1)));
219 /* Set up the tags for the instruction memory. */
221 gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
223 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
224 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
226 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
227 if (i && ((i % (256/sizeof(u32))) == 0)) {
229 gk20a_writel(g, gr_gpccs_imemt_r(0),
230 gr_gpccs_imemt_tag_f(tag));
232 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
233 checksum += ucode_u32_data[i];
237 pad_end = pad_start + (256 - pad_start % 256) + 256;
239 (i < gpccs_imem_size * 256) && (i < pad_end);
241 if (i && ((i % 256) == 0)) {
243 gk20a_writel(g, gr_gpccs_imemt_r(0),
244 gr_gpccs_imemt_tag_f(tag));
246 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
249 gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
250 gr_fecs_imemc_blk_f(0) |
251 gr_fecs_imemc_aincw_f(1)));
253 /* Set up the tags for the instruction memory. */
255 gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
257 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
258 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
260 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
261 if (i && ((i % (256/sizeof(u32))) == 0)) {
263 gk20a_writel(g, gr_fecs_imemt_r(0),
264 gr_fecs_imemt_tag_f(tag));
266 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
267 checksum += ucode_u32_data[i];
271 pad_end = pad_start + (256 - pad_start % 256) + 256;
272 for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
273 if (i && ((i % 256) == 0)) {
275 gk20a_writel(g, gr_fecs_imemt_r(0),
276 gr_fecs_imemt_tag_f(tag));
278 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
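/*
 * Poll until the GR engine is idle: either PGRAPH is disabled in mc_enable,
 * or both the engine status and the ctxsw-in-progress flag report idle.
 * The poll interval doubles on every iteration up to GR_IDLE_CHECK_MAX.
 */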
282 static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
285 u32 delay = expect_delay;
293 /* fmodel: host gets fifo_engine_status(gr) from gr
294 only when gr_status is read */
295 gk20a_readl(g, gr_status_r());
297 gr_enabled = gk20a_readl(g, mc_enable_r()) &
298 mc_enable_pgraph_enabled_f();
300 ctxsw_active = gk20a_readl(g,
301 fifo_engine_status_r(ENGINE_GR_GK20A)) &
302 fifo_engine_status_ctxsw_in_progress_f();
304 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
305 gr_engine_status_value_busy_f();
307 if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
308 gk20a_dbg_fn("done");
312 usleep_range(delay, delay * 2);
313 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
315 } while (time_before(jiffies, end_jiffies)
316 || !tegra_platform_is_silicon());
318 gk20a_err(dev_from_gk20a(g),
319 "timeout, ctxsw busy : %d, gr busy : %d",
320 ctxsw_active, gr_busy);
325 static int gr_gk20a_wait_fe_idle(struct gk20a *g, unsigned long end_jiffies,
329 u32 delay = expect_delay;
334 val = gk20a_readl(g, gr_status_r());
336 if (!gr_status_fe_method_upper_v(val) &&
337 !gr_status_fe_method_lower_v(val) &&
338 !gr_status_fe_method_fe_gi_v(val)) {
339 gk20a_dbg_fn("done");
343 usleep_range(delay, delay * 2);
344 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
345 } while (time_before(jiffies, end_jiffies)
346 || !tegra_platform_is_silicon());
348 gk20a_err(dev_from_gk20a(g),
349 "timeout, fe busy : %x", val);
353 static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
355 u32 delay = GR_IDLE_CHECK_DEFAULT;
356 unsigned long end_jiffies = jiffies +
357 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
362 if (!tegra_platform_is_linsim()) {
363 /* Force clocks on */
364 gk20a_writel(g, gr_fe_pwr_mode_r(),
365 gr_fe_pwr_mode_req_send_f() |
366 gr_fe_pwr_mode_mode_force_on_f());
368 /* Wait for the clocks to indicate that they are on */
370 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
372 if (gr_fe_pwr_mode_req_v(reg) ==
373 gr_fe_pwr_mode_req_done_v())
376 usleep_range(delay, delay * 2);
377 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
379 } while (time_before(jiffies, end_jiffies));
381 if (!time_before(jiffies, end_jiffies)) {
382 gk20a_err(dev_from_gk20a(g),
383 "failed to force the clocks on\n");
388 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
390 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
391 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
392 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
393 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
394 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
395 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
396 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
397 gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
398 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
399 gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
402 /* we need to read the reset register *and* wait for a moment to ensure
403 * reset propagation */
405 gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
408 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
409 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
410 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
411 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
412 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
413 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
414 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
415 gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
416 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
417 gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
419 /* we need to readl the reset register and then wait a small moment after that */
420 gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
423 if (!tegra_platform_is_linsim()) {
424 /* Set power mode back to auto */
425 gk20a_writel(g, gr_fe_pwr_mode_r(),
426 gr_fe_pwr_mode_req_send_f() |
427 gr_fe_pwr_mode_mode_auto_f());
429 /* Wait for the request to complete */
430 end_jiffies = jiffies +
431 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
433 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
435 if (gr_fe_pwr_mode_req_v(reg) ==
436 gr_fe_pwr_mode_req_done_v())
439 usleep_range(delay, delay * 2);
440 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
442 } while (time_before(jiffies, end_jiffies));
444 if (!time_before(jiffies, end_jiffies))
445 gk20a_warn(dev_from_gk20a(g),
446 "failed to set power mode to auto\n");
452 static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
453 u32 *mailbox_ret, u32 opc_success,
454 u32 mailbox_ok, u32 opc_fail,
457 unsigned long end_jiffies = jiffies +
458 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
459 u32 delay = GR_IDLE_CHECK_DEFAULT;
460 u32 check = WAIT_UCODE_LOOP;
465 while (check == WAIT_UCODE_LOOP) {
466 if (!time_before(jiffies, end_jiffies) &&
467 tegra_platform_is_silicon())
468 check = WAIT_UCODE_TIMEOUT;
470 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
475 switch (opc_success) {
476 case GR_IS_UCODE_OP_EQUAL:
477 if (reg == mailbox_ok)
478 check = WAIT_UCODE_OK;
480 case GR_IS_UCODE_OP_NOT_EQUAL:
481 if (reg != mailbox_ok)
482 check = WAIT_UCODE_OK;
484 case GR_IS_UCODE_OP_AND:
485 if (reg & mailbox_ok)
486 check = WAIT_UCODE_OK;
488 case GR_IS_UCODE_OP_LESSER:
489 if (reg < mailbox_ok)
490 check = WAIT_UCODE_OK;
492 case GR_IS_UCODE_OP_LESSER_EQUAL:
493 if (reg <= mailbox_ok)
494 check = WAIT_UCODE_OK;
496 case GR_IS_UCODE_OP_SKIP:
497 /* do no success check */
500 gk20a_err(dev_from_gk20a(g),
501 "invalid success opcode 0x%x", opc_success);
503 check = WAIT_UCODE_ERROR;
508 case GR_IS_UCODE_OP_EQUAL:
509 if (reg == mailbox_fail)
510 check = WAIT_UCODE_ERROR;
512 case GR_IS_UCODE_OP_NOT_EQUAL:
513 if (reg != mailbox_fail)
514 check = WAIT_UCODE_ERROR;
516 case GR_IS_UCODE_OP_AND:
517 if (reg & mailbox_fail)
518 check = WAIT_UCODE_ERROR;
520 case GR_IS_UCODE_OP_LESSER:
521 if (reg < mailbox_fail)
522 check = WAIT_UCODE_ERROR;
524 case GR_IS_UCODE_OP_LESSER_EQUAL:
525 if (reg <= mailbox_fail)
526 check = WAIT_UCODE_ERROR;
528 case GR_IS_UCODE_OP_SKIP:
529 /* do no check on failure */
532 gk20a_err(dev_from_gk20a(g),
533 "invalid fail opcode 0x%x", opc_fail);
534 check = WAIT_UCODE_ERROR;
538 usleep_range(delay, delay * 2);
539 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
542 if (check == WAIT_UCODE_TIMEOUT) {
543 gk20a_err(dev_from_gk20a(g),
544 "timeout waiting on ucode response");
545 gk20a_fecs_dump_falcon_stats(g);
547 } else if (check == WAIT_UCODE_ERROR) {
548 gk20a_err(dev_from_gk20a(g),
549 "ucode method failed on mailbox=%d value=0x%08x",
551 gk20a_fecs_dump_falcon_stats(g);
555 gk20a_dbg_fn("done");
559 /* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...).
560 * We should replace most, if not all, fecs method calls with this instead. */
561 struct fecs_method_op_gk20a {
583 int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
584 struct fecs_method_op_gk20a op)
586 struct gr_gk20a *gr = &g->gr;
589 mutex_lock(&gr->fecs_mutex);
591 if (op.mailbox.id != 0)
592 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
595 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
596 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
598 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
599 gk20a_writel(g, gr_fecs_method_push_r(),
600 gr_fecs_method_push_adr_f(op.method.addr));
602 /* op.mailbox.id == 4 cases require waiting for completion
603 * on mailbox 0 */
604 if (op.mailbox.id == 4)
607 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
608 op.cond.ok, op.mailbox.ok,
609 op.cond.fail, op.mailbox.fail);
611 mutex_unlock(&gr->fecs_mutex);
616 int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
618 return gr_gk20a_submit_fecs_method_op(g,
619 (struct fecs_method_op_gk20a) {
620 .method.addr = fecs_method,
622 .mailbox = { .id = 1, /*sideband?*/
623 .data = ~0, .clr = ~0, .ret = ret,
624 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
625 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
626 .cond.ok = GR_IS_UCODE_OP_EQUAL,
627 .cond.fail = GR_IS_UCODE_OP_EQUAL });
630 /* Stop processing (stall) context switches at FECS.
631 * The caller must hold the dbg_sessions_lock; otherwise, if multiple stop methods
632 * are sent to the ucode in sequence, it can get into an undefined state. */
633 int gr_gk20a_disable_ctxsw(struct gk20a *g)
635 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
636 return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
639 /* Start processing (continue) context switches at FECS */
640 int gr_gk20a_enable_ctxsw(struct gk20a *g)
642 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
643 return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
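/*
 * Illustrative call pattern for the stop/start ctxsw pair above (a sketch;
 * assumes the lock named in the comment on gr_gk20a_disable_ctxsw() lives
 * at g->dbg_sessions_lock):
 *
 *	mutex_lock(&g->dbg_sessions_lock);
 *	err = gr_gk20a_disable_ctxsw(g);
 *	if (!err) {
 *		... touch FECS-managed state ...
 *		err = gr_gk20a_enable_ctxsw(g);
 *	}
 *	mutex_unlock(&g->dbg_sessions_lock);
 */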
647 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
651 void *inst_ptr = NULL;
655 inst_ptr = c->inst_block.cpuva;
659 addr_lo = u64_lo32(gpu_va) >> 12;
660 addr_hi = u64_hi32(gpu_va);
662 gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
663 ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
664 ram_in_gr_wfi_ptr_lo_f(addr_lo));
666 gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
667 ram_in_gr_wfi_ptr_hi_f(addr_hi));
673 * Context state can be written directly, or "patched" at times.
674 * So that the code can be used in either situation, it is written
675 * using a series of _ctx_patch_write(..., patch) statements.
676 * However, any necessary cpu map/unmap and gpu l2 invalidates
677 * should be minimized (to avoid doing them once per patch write).
678 * Before such a sequence, set it up with "_ctx_patch_write_begin"
679 * and close it with "_ctx_patch_write_end."
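/*
 * Typical call-site sequence (illustrative; gr_gk20a_commit_global_cb_manager()
 * below follows this shape):
 *
 *	err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
 *	if (err)
 *		return err;
 *	gr_gk20a_ctx_patch_write(g, ch_ctx, addr, data, patch);
 *	...
 *	gr_gk20a_ctx_patch_write_end(g, ch_ctx);
 */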
681 int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
682 struct channel_ctx_gk20a *ch_ctx)
684 /* being defensive still... */
685 if (ch_ctx->patch_ctx.cpu_va) {
686 gk20a_err(dev_from_gk20a(g), "nested ctx patch begin?");
690 ch_ctx->patch_ctx.cpu_va = vmap(ch_ctx->patch_ctx.pages,
691 PAGE_ALIGN(ch_ctx->patch_ctx.size) >> PAGE_SHIFT,
692 0, pgprot_dmacoherent(PAGE_KERNEL));
694 if (!ch_ctx->patch_ctx.cpu_va)
700 int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
701 struct channel_ctx_gk20a *ch_ctx)
703 /* being defensive still... */
704 if (!ch_ctx->patch_ctx.cpu_va) {
705 gk20a_err(dev_from_gk20a(g), "dangling ctx patch end?");
709 vunmap(ch_ctx->patch_ctx.cpu_va);
710 ch_ctx->patch_ctx.cpu_va = NULL;
714 int gr_gk20a_ctx_patch_write(struct gk20a *g,
715 struct channel_ctx_gk20a *ch_ctx,
716 u32 addr, u32 data, bool patch)
719 void *patch_ptr = NULL;
720 bool mapped_here = false;
722 BUG_ON(patch != 0 && ch_ctx == NULL);
727 /* we added an optimization prolog/epilog
728 * to get rid of unnecessary maps and l2 invalidates,
729 * but be defensive still... */
730 if (!ch_ctx->patch_ctx.cpu_va) {
732 gk20a_err(dev_from_gk20a(g),
733 "per-write ctx patch begin?");
734 /* yes, gr_gk20a_ctx_patch_smpc causes this one */
735 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
742 patch_ptr = ch_ctx->patch_ctx.cpu_va;
743 patch_slot = ch_ctx->patch_ctx.data_count * 2;
745 gk20a_mem_wr32(patch_ptr, patch_slot++, addr);
746 gk20a_mem_wr32(patch_ptr, patch_slot++, data);
748 ch_ctx->patch_ctx.data_count++;
751 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
754 gk20a_writel(g, addr, data);
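/*
 * Tell FECS which instance block to treat as the current context ("bind"
 * the channel) by pushing the bind_pointer method with the channel's
 * instance block address.
 */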
759 static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
760 struct channel_gk20a *c)
762 u32 inst_base_ptr = u64_lo32(c->inst_block.cpu_pa
763 >> ram_in_base_shift_v());
766 gk20a_dbg_info("bind channel %d inst ptr 0x%08x",
767 c->hw_chid, inst_base_ptr);
769 ret = gr_gk20a_submit_fecs_method_op(g,
770 (struct fecs_method_op_gk20a) {
771 .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
772 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
773 gr_fecs_current_ctx_target_vid_mem_f() |
774 gr_fecs_current_ctx_valid_f(1)),
775 .mailbox = { .id = 0, .data = 0,
780 .cond.ok = GR_IS_UCODE_OP_AND,
781 .cond.fail = GR_IS_UCODE_OP_AND});
783 gk20a_err(dev_from_gk20a(g),
784 "bind channel instance failed");
789 static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
792 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
793 struct fifo_gk20a *f = &g->fifo;
794 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
795 u32 va_lo, va_hi, va;
797 void *ctx_ptr = NULL;
801 ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
802 PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
803 0, pgprot_dmacoherent(PAGE_KERNEL));
807 if (ch_ctx->zcull_ctx.gpu_va == 0 &&
808 ch_ctx->zcull_ctx.ctx_sw_mode ==
809 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
814 va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
815 va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
816 va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
819 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
821 gk20a_err(dev_from_gk20a(g),
822 "failed to disable gr engine activity\n");
827 gk20a_mm_fb_flush(g);
829 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
830 ch_ctx->zcull_ctx.ctx_sw_mode);
832 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);
835 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
837 gk20a_err(dev_from_gk20a(g),
838 "failed to enable gr engine activity\n");
849 static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
850 struct channel_gk20a *c, bool patch)
852 struct gr_gk20a *gr = &g->gr;
853 struct channel_ctx_gk20a *ch_ctx = NULL;
854 u32 attrib_offset_in_chunk = 0;
855 u32 alpha_offset_in_chunk = 0;
856 u32 pd_ab_max_output;
857 u32 gpc_index, ppc_index;
859 u32 cbm_cfg_size1, cbm_cfg_size2;
866 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
871 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
872 gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
873 gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
876 pd_ab_max_output = (gr->alpha_cb_default_size *
877 gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
878 gr_pd_ab_dist_cfg1_max_output_granularity_v();
880 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
881 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
882 gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
884 alpha_offset_in_chunk = attrib_offset_in_chunk +
885 gr->tpc_count * gr->attrib_cb_size;
887 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
888 temp = proj_gpc_stride_v() * gpc_index;
889 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
891 cbm_cfg_size1 = gr->attrib_cb_default_size *
892 gr->pes_tpc_count[ppc_index][gpc_index];
893 cbm_cfg_size2 = gr->alpha_cb_default_size *
894 gr->pes_tpc_count[ppc_index][gpc_index];
896 gr_gk20a_ctx_patch_write(g, ch_ctx,
897 gr_gpc0_ppc0_cbm_cfg_r() + temp +
898 proj_ppc_in_gpc_stride_v() * ppc_index,
899 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
900 gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
901 gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
903 attrib_offset_in_chunk += gr->attrib_cb_size *
904 gr->pes_tpc_count[ppc_index][gpc_index];
906 gr_gk20a_ctx_patch_write(g, ch_ctx,
907 gr_gpc0_ppc0_cbm_cfg2_r() + temp +
908 proj_ppc_in_gpc_stride_v() * ppc_index,
909 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
910 gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
912 alpha_offset_in_chunk += gr->alpha_cb_size *
913 gr->pes_tpc_count[ppc_index][gpc_index];
918 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
923 static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
924 struct channel_gk20a *c, bool patch)
926 struct gr_gk20a *gr = &g->gr;
927 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
934 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
939 /* global pagepool buffer */
940 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
941 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
942 (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
943 (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
945 size = gr->global_ctx_buffer[PAGEPOOL].size /
946 gr_scc_pagepool_total_pages_byte_granularity_v();
948 if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
949 size = gr_scc_pagepool_total_pages_hwmax_v();
951 gk20a_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
954 g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, patch);
956 /* global bundle cb */
957 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
958 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
959 (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
960 (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
962 size = gr->bundle_cb_default_size;
964 gk20a_dbg_info("bundle cb addr : 0x%016llx, size : %d",
967 g->ops.gr.commit_global_bundle_cb(g, ch_ctx, addr, size, patch);
969 /* global attrib cb */
970 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
971 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
972 (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
973 (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
975 gk20a_dbg_info("attrib cb addr : 0x%016llx", addr);
976 g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, patch);
979 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
984 static void gr_gk20a_commit_global_attrib_cb(struct gk20a *g,
985 struct channel_ctx_gk20a *ch_ctx,
986 u64 addr, bool patch)
988 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
989 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
990 gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
992 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
993 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
994 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
997 static void gr_gk20a_commit_global_bundle_cb(struct gk20a *g,
998 struct channel_ctx_gk20a *ch_ctx,
999 u64 addr, u64 size, bool patch)
1003 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
1004 gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
1006 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
1007 gr_scc_bundle_cb_size_div_256b_f(size) |
1008 gr_scc_bundle_cb_size_valid_true_f(), patch);
1010 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
1011 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
1013 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
1014 gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
1015 gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
1017 /* data for state_limit */
1018 data = (g->gr.bundle_cb_default_size *
1019 gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
1020 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
1022 data = min_t(u32, data, g->gr.min_gpm_fifo_depth);
1024 gk20a_dbg_info("bundle cb token limit : %d, state limit : %d",
1025 g->gr.bundle_cb_token_limit, data);
1027 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
1028 gr_pd_ab_dist_cfg2_token_limit_f(g->gr.bundle_cb_token_limit) |
1029 gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
1033 static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
1035 struct gr_gk20a *gr = &g->gr;
1036 struct channel_ctx_gk20a *ch_ctx = NULL;
1038 u32 pd_ab_dist_cfg0;
1046 gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1047 pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1048 ds_debug = gk20a_readl(g, gr_ds_debug_r());
1049 mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1053 ch_ctx = &c->ch_ctx;
1054 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
1059 if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1060 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1061 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1063 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1064 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1065 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1066 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1067 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1068 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1070 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1071 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
1072 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
1073 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1074 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1075 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1077 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1078 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1079 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1080 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1082 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1083 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1084 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1085 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1089 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
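/*
 * Program the screen-tile to GPC mapping from gr->map_tiles into the CRSTR,
 * WWDX and RSTR2D copies of the map table, along with the normalized entry
 * count, shift and (1 << n) % norm_entries coefficients used by WWDX.
 */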
1094 int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
1096 u32 norm_entries, norm_shift;
1097 u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1098 u32 map0, map1, map2, map3, map4, map5;
1105 gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1106 gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1107 gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1109 map0 = gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
1110 gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
1111 gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
1112 gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
1113 gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
1114 gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
1116 map1 = gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
1117 gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
1118 gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
1119 gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
1120 gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
1121 gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
1123 map2 = gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
1124 gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
1125 gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
1126 gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
1127 gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
1128 gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
1130 map3 = gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
1131 gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
1132 gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
1133 gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
1134 gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
1135 gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
1137 map4 = gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
1138 gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
1139 gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
1140 gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
1141 gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
1142 gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
1144 map5 = gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
1145 gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
1146 gr_crstr_gpc_map5_tile32_f(0) |
1147 gr_crstr_gpc_map5_tile33_f(0) |
1148 gr_crstr_gpc_map5_tile34_f(0) |
1149 gr_crstr_gpc_map5_tile35_f(0);
1151 gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1152 gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1153 gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1154 gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1155 gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1156 gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1158 switch (gr->tpc_count) {
1187 norm_entries = gr->tpc_count << norm_shift;
1188 coeff5_mod = (1 << 5) % norm_entries;
1189 coeff6_mod = (1 << 6) % norm_entries;
1190 coeff7_mod = (1 << 7) % norm_entries;
1191 coeff8_mod = (1 << 8) % norm_entries;
1192 coeff9_mod = (1 << 9) % norm_entries;
1193 coeff10_mod = (1 << 10) % norm_entries;
1194 coeff11_mod = (1 << 11) % norm_entries;
1196 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1197 gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1198 gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1199 gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1200 gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1201 gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1203 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1204 gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1205 gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1206 gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1207 gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1208 gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1209 gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1211 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1212 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1213 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1214 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1215 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1216 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1218 gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1219 gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1220 gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1222 gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1223 gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1224 gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1225 gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1226 gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1227 gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1232 static inline u32 count_bits(u32 mask)
1236 for (count = 0; temp != 0; count++)
1242 static inline u32 clear_count_bits(u32 num, u32 clear_count)
1244 u32 count = clear_count;
1245 for (; (num != 0) && (count != 0); count--)
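/*
 * Build the PD alpha/beta ratio tables: each of the 32 table rows splits
 * the TPCs between alpha and beta work in a different proportion, and per
 * row a per-GPC TPC mask is carved out of the PES TPC masks for each side.
 */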
1251 static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
1252 struct gr_gk20a *gr)
1254 u32 table_index_bits = 5;
1255 u32 rows = (1 << table_index_bits);
1256 u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
1261 u32 gpcs_per_reg = 4;
1264 u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
1266 u32 alpha_target, beta_target;
1267 u32 alpha_bits, beta_bits;
1268 u32 alpha_mask, beta_mask, partial_mask;
1272 u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
1273 u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
1274 u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
1278 memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1279 memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1280 memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1282 for (row = 0; row < rows; ++row) {
1283 alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
1284 beta_target = gr->tpc_count - alpha_target;
1286 assign_alpha = (alpha_target < beta_target);
1288 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1289 reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
1290 alpha_mask = beta_mask = 0;
1292 for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
1293 tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
1296 alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
1297 beta_bits = tpc_count_pes - alpha_bits;
1299 beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
1300 alpha_bits = tpc_count_pes - beta_bits;
1303 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
1304 partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
1305 alpha_mask |= partial_mask;
1307 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
1308 beta_mask |= partial_mask;
1310 alpha_target -= min(alpha_bits, alpha_target);
1311 beta_target -= min(beta_bits, beta_target);
1313 if ((alpha_bits > 0) || (beta_bits > 0))
1314 assign_alpha = !assign_alpha;
1317 switch (gpc_index % gpcs_per_reg) {
1319 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
1320 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
1323 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
1324 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
1327 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
1328 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
1331 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
1332 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
1335 map_reg_used[reg_offset] = true;
1339 for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
1340 if (map_reg_used[index]) {
1341 gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
1342 gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
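/*
 * Apply the floorsweeping configuration to GR: assign SM ids to the active
 * TPCs, program the per-GPC TPC counts, the PD skip table, the CWD GPC/TPC
 * totals and the ZROP/CROP active-FBP counts, and invoke the ROP and
 * alpha/beta mapping setup above.
 */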
1349 static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
1351 struct gr_gk20a *gr = &g->gr;
1352 u32 tpc_index, gpc_index;
1353 u32 tpc_offset, gpc_offset;
1354 u32 sm_id = 0, gpc_id = 0;
1355 u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
1357 u32 max_ways_evict = INVALID_MAX_WAYS;
1358 u32 l1c_dbg_reg_val;
1362 for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
1363 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1364 gpc_offset = proj_gpc_stride_v() * gpc_index;
1365 if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
1366 tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
1368 gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
1369 gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
1370 gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
1371 gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
1372 gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
1373 gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
1374 gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
1375 gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
1377 sm_id_to_gpc_id[sm_id] = gpc_index;
1381 gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
1382 gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1383 gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
1384 gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1388 for (tpc_index = 0, gpc_id = 0;
1389 tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
1390 tpc_index++, gpc_id += 8) {
1392 if (gpc_id >= gr->gpc_count)
1396 gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
1397 gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
1398 gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
1399 gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
1400 gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
1401 gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
1402 gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
1403 gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
1405 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1406 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1409 /* gr__setup_pd_mapping stubbed for gk20a */
1410 gr_gk20a_setup_rop_mapping(g, gr);
1411 if (g->ops.gr.setup_alpha_beta_tables)
1412 g->ops.gr.setup_alpha_beta_tables(g, gr);
1414 if (gr->num_fbps == 1)
1417 if (max_ways_evict != INVALID_MAX_WAYS)
1418 g->ops.ltc.set_max_ways_evict_last(g, max_ways_evict);
1421 gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1424 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1425 gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
1426 gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
1427 gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
1428 gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
1431 gk20a_writel(g, gr_cwd_fs_r(),
1432 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1433 gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1435 gk20a_writel(g, gr_bes_zrop_settings_r(),
1436 gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1437 gk20a_writel(g, gr_bes_crop_settings_r(),
1438 gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1440 /* turn on cya15 bit for a default val that missed the cut */
1441 l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
1442 l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
1443 gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);
1448 static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1450 struct gk20a *g = c->g;
1454 u64_lo32(c->inst_block.cpu_pa
1455 >> ram_in_base_shift_v());
1460 ret = gr_gk20a_submit_fecs_method_op(g,
1461 (struct fecs_method_op_gk20a) {
1462 .method.addr = save_type,
1463 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1464 gr_fecs_current_ctx_target_vid_mem_f() |
1465 gr_fecs_current_ctx_valid_f(1)),
1466 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1469 .cond.ok = GR_IS_UCODE_OP_AND,
1470 .cond.fail = GR_IS_UCODE_OP_AND,
1474 gk20a_err(dev_from_gk20a(g), "save context image failed");
1479 static u32 gk20a_init_sw_bundle(struct gk20a *g)
1481 struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
1482 u32 last_bundle_data = 0;
1485 unsigned long end_jiffies = jiffies +
1486 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
1488 /* enable pipe mode override */
1489 gk20a_writel(g, gr_pipe_bundle_config_r(),
1490 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
1492 /* load bundle init */
1493 for (i = 0; i < sw_bundle_init->count; i++) {
1494 err |= gr_gk20a_wait_fe_idle(g, end_jiffies,
1495 GR_IDLE_CHECK_DEFAULT);
1496 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
1497 gk20a_writel(g, gr_pipe_bundle_data_r(),
1498 sw_bundle_init->l[i].value);
1499 last_bundle_data = sw_bundle_init->l[i].value;
1502 gk20a_writel(g, gr_pipe_bundle_address_r(),
1503 sw_bundle_init->l[i].addr);
1505 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
1507 err |= gr_gk20a_wait_idle(g, end_jiffies,
1508 GR_IDLE_CHECK_DEFAULT);
1511 /* disable pipe mode override */
1512 gk20a_writel(g, gr_pipe_bundle_config_r(),
1513 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1518 /* init global golden image from a fresh gr_ctx in channel ctx.
1519 save a copy in local_golden_image in ctx_vars */
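/*
 * Sequence: bind the channel to FECS, run the sw bundle init, commit the
 * global ctx buffers, copy the FECS ctx header into the golden buffer,
 * have FECS save a golden image, and finally cache that image in
 * local_golden_image so gr_gk20a_load_golden_ctx_image() can seed later
 * channels from it.
 */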
1520 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1521 struct channel_gk20a *c)
1523 struct gr_gk20a *gr = &g->gr;
1524 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1525 u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1526 u32 ctx_header_words;
1529 void *ctx_ptr = NULL;
1530 void *gold_ptr = NULL;
1535 /* golden ctx is global to all channels. Although only the first
1536 channel initializes the golden image, the driver needs to prevent multiple
1537 channels from initializing the golden ctx at the same time */
1538 mutex_lock(&gr->ctx_mutex);
1540 if (gr->ctx_vars.golden_image_initialized)
1543 err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1547 err = gk20a_init_sw_bundle(g);
1551 err = gr_gk20a_elpg_protected_call(g,
1552 gr_gk20a_commit_global_ctx_buffers(g, c, false));
1556 gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].pages,
1557 PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].size) >>
1558 PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
1562 ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1563 PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1564 0, pgprot_dmacoherent(PAGE_KERNEL));
1568 ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1569 ctx_header_words >>= 2;
1571 gk20a_mm_l2_flush(g, true);
1573 for (i = 0; i < ctx_header_words; i++) {
1574 data = gk20a_mem_rd32(ctx_ptr, i);
1575 gk20a_mem_wr32(gold_ptr, i, data);
1578 gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
1579 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1581 gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);
1583 gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1585 gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1587 if (gr->ctx_vars.local_golden_image == NULL) {
1589 gr->ctx_vars.local_golden_image =
1590 kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
1592 if (gr->ctx_vars.local_golden_image == NULL) {
1597 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1598 gr->ctx_vars.local_golden_image[i] =
1599 gk20a_mem_rd32(gold_ptr, i);
1602 gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
1604 gr->ctx_vars.golden_image_initialized = true;
1606 gk20a_writel(g, gr_fecs_current_ctx_r(),
1607 gr_fecs_current_ctx_valid_false_f());
1611 gk20a_err(dev_from_gk20a(g), "fail");
1613 gk20a_dbg_fn("done");
1620 mutex_unlock(&gr->ctx_mutex);
1624 int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1625 struct channel_gk20a *c,
1626 bool enable_smpc_ctxsw)
1628 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1629 void *ctx_ptr = NULL;
1632 /* Channel gr_ctx buffer is gpu cacheable.
1633 Flush and invalidate before cpu update. */
1634 gk20a_mm_l2_flush(g, true);
1636 ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1637 PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1638 0, pgprot_dmacoherent(PAGE_KERNEL));
1642 data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1643 data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1644 data |= enable_smpc_ctxsw ?
1645 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1646 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1647 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1655 /* load a saved fresh copy of the golden image into the channel gr_ctx */
1656 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1657 struct channel_gk20a *c)
1659 struct gr_gk20a *gr = &g->gr;
1660 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1665 void *ctx_ptr = NULL;
1669 if (gr->ctx_vars.local_golden_image == NULL)
1672 /* Channel gr_ctx buffer is gpu cacheable.
1673 Flush and invalidate before cpu update. */
1674 gk20a_mm_l2_flush(g, true);
1676 ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1677 PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1678 0, pgprot_dmacoherent(PAGE_KERNEL));
1682 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1683 gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
1685 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
1686 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);
1688 virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
1689 virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
1691 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
1692 ch_ctx->patch_ctx.data_count);
1693 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
1695 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
1698 /* no user for client managed performance counter ctx */
1699 ch_ctx->pm_ctx.ctx_sw_mode =
1700 ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1701 data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1702 data = data & ~ctxsw_prog_main_image_pm_mode_m();
1703 data |= ch_ctx->pm_ctx.ctx_sw_mode;
1704 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1707 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);
1709 /* set priv access map */
1711 u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1713 u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1715 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0,
1716 ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f());
1717 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0,
1719 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0,
1721 /* disable verif features */
1722 v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0);
1723 v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
1724 v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
1725 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v);
1730 if (tegra_platform_is_linsim()) {
1732 u64_lo32(c->inst_block.cpu_pa
1733 >> ram_in_base_shift_v());
1735 ret = gr_gk20a_submit_fecs_method_op(g,
1736 (struct fecs_method_op_gk20a) {
1738 (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1739 gr_fecs_current_ctx_target_vid_mem_f() |
1740 gr_fecs_current_ctx_valid_f(1)),
1742 gr_fecs_method_push_adr_restore_golden_v(),
1745 .clr = ~0, .ret = NULL,
1746 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
1748 .cond.ok = GR_IS_UCODE_OP_EQUAL,
1749 .cond.fail = GR_IS_UCODE_OP_SKIP});
1752 gk20a_err(dev_from_gk20a(g),
1753 "restore context image failed");
1759 static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
1763 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
1764 gr_fecs_ctxsw_mailbox_clear_value_f(~0));
1766 gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
1767 gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
1769 gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
1770 gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
1772 gk20a_dbg_fn("done");
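/*
 * Build the address space the ctxsw ucode is loaded from: allocate an
 * instance block, point it at the PMU vm's page directory and va limit,
 * and map the ucode surface read-only into that vm.
 */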
1775 static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
1777 struct mm_gk20a *mm = &g->mm;
1778 struct vm_gk20a *vm = &mm->pmu.vm;
1779 struct device *d = dev_from_gk20a(g);
1780 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1787 /* Allocate memory for the inst block */
1788 ucode_info->inst_blk_desc.size = ram_in_alloc_size_v();
1789 ucode_info->inst_blk_desc.cpuva = dma_alloc_coherent(d,
1790 ucode_info->inst_blk_desc.size,
1793 if (!ucode_info->inst_blk_desc.cpuva) {
1794 gk20a_err(d, "failed to allocate memory\n");
1798 ucode_info->inst_blk_desc.iova = iova;
1799 ucode_info->inst_blk_desc.cpu_pa = gk20a_get_phys_from_iova(d,
1800 ucode_info->inst_blk_desc.iova);
1802 inst_ptr = ucode_info->inst_blk_desc.cpuva;
1804 /* Set inst block */
1805 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
1806 u64_lo32(vm->va_limit) | 0xFFF);
1807 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
1808 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
1810 pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
1811 pde_addr_lo = u64_lo32(pde_addr >> 12);
1812 pde_addr_hi = u64_hi32(pde_addr);
1813 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
1814 ram_in_page_dir_base_target_vid_mem_f() |
1815 ram_in_page_dir_base_vol_true_f() |
1816 ram_in_page_dir_base_lo_f(pde_addr_lo));
1817 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
1818 ram_in_page_dir_base_hi_f(pde_addr_hi));
1820 /* Map ucode surface to GMMU */
1821 ucode_info->ucode_gpuva = gk20a_gmmu_map(vm,
1822 &ucode_info->surface_desc.sgt,
1823 ucode_info->surface_desc.size,
1825 gk20a_mem_flag_read_only);
1826 if (!ucode_info->ucode_gpuva) {
1827 gk20a_err(d, "failed to update gmmu ptes\n");
1834 static void gr_gk20a_init_ctxsw_ucode_segment(
1835 struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
1837 p_seg->offset = *offset;
1839 *offset = ALIGN(*offset + size, BLK_SIZE);
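/*
 * The ucode surface is laid out as consecutive boot/code/data segments for
 * FECS followed by GPCCS, each segment padded out to a BLK_SIZE boundary.
 */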
1842 static void gr_gk20a_init_ctxsw_ucode_segments(
1843 struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
1844 struct gk20a_ctxsw_bootloader_desc *bootdesc,
1845 u32 code_size, u32 data_size)
1847 u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
1848 segments->boot_entry = bootdesc->entry_point;
1849 segments->boot_imem_offset = bootdesc->imem_offset;
1850 gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
1851 gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
1852 gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
1855 static int gr_gk20a_copy_ctxsw_ucode_segments(
1857 struct gk20a_ctxsw_ucode_segments *segments,
1859 u32 *code, u32 *data)
1861 memcpy(buf + segments->boot.offset, bootimage, segments->boot.size);
1862 memcpy(buf + segments->code.offset, code, segments->code.size);
1863 memcpy(buf + segments->data.offset, data, segments->data.size);
1867 static int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
1869 struct device *d = dev_from_gk20a(g);
1870 struct mm_gk20a *mm = &g->mm;
1871 struct vm_gk20a *vm = &mm->pmu.vm;
1872 struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
1873 struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
1874 const struct firmware *fecs_fw;
1875 const struct firmware *gpccs_fw;
1876 u32 *fecs_boot_image;
1877 u32 *gpccs_boot_image;
1878 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1883 DEFINE_DMA_ATTRS(attrs);
1885 fecs_fw = gk20a_request_firmware(g, GK20A_FECS_UCODE_IMAGE);
1887 gk20a_err(d, "failed to load fecs ucode!!");
1891 fecs_boot_desc = (void *)fecs_fw->data;
1892 fecs_boot_image = (void *)(fecs_fw->data +
1893 sizeof(struct gk20a_ctxsw_bootloader_desc));
1895 gpccs_fw = gk20a_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE);
1897 release_firmware(fecs_fw);
1898 gk20a_err(d, "failed to load gpccs ucode!!");
1902 gpccs_boot_desc = (void *)gpccs_fw->data;
1903 gpccs_boot_image = (void *)(gpccs_fw->data +
1904 sizeof(struct gk20a_ctxsw_bootloader_desc));
1907 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
1909 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
1910 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
1911 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
1913 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
1914 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
1916 ucode_info->surface_desc.size = ucode_size;
1917 dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
1918 ucode_info->surface_desc.cpuva = dma_alloc_attrs(d,
1919 ucode_info->surface_desc.size,
1923 if (!ucode_info->surface_desc.cpuva) {
1924 gk20a_err(d, "memory allocation failed\n");
1929 ucode_info->surface_desc.iova = iova;
1930 err = gk20a_get_sgtable(d, &ucode_info->surface_desc.sgt,
1931 ucode_info->surface_desc.cpuva,
1932 ucode_info->surface_desc.iova,
1933 ucode_info->surface_desc.size);
1935 gk20a_err(d, "failed to create sg table\n");
1939 buf = (u8 *)ucode_info->surface_desc.cpuva;
1941 gk20a_err(d, "failed to map surface desc buffer");
1946 gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs,
1948 g->gr.ctx_vars.ucode.fecs.inst.l,
1949 g->gr.ctx_vars.ucode.fecs.data.l);
1951 release_firmware(fecs_fw);
1954 gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs,
1956 g->gr.ctx_vars.ucode.gpccs.inst.l,
1957 g->gr.ctx_vars.ucode.gpccs.data.l);
1959 release_firmware(gpccs_fw);
1962 err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
1966 gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
1971 if (ucode_info->ucode_gpuva)
1972 gk20a_gmmu_unmap(vm, ucode_info->ucode_gpuva,
1973 ucode_info->surface_desc.size, gk20a_mem_flag_none);
1974 if (ucode_info->surface_desc.sgt)
1975 gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
1976 if (ucode_info->surface_desc.cpuva)
1977 dma_free_attrs(d, ucode_info->surface_desc.size,
1978 ucode_info->surface_desc.cpuva,
1979 ucode_info->surface_desc.iova,
1981 ucode_info->surface_desc.cpuva = NULL;
1982 ucode_info->surface_desc.iova = 0;
1984 release_firmware(gpccs_fw);
1986 release_firmware(fecs_fw);
1992 static void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
1994 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1996 phys_addr_t inst_ptr;
1999 while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2000 gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
2005 gk20a_err(dev_from_gk20a(g), "arbiter idle timeout");
2007 gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
2009 inst_ptr = ucode_info->inst_blk_desc.cpu_pa;
2010 gk20a_writel(g, gr_fecs_new_ctx_r(),
2011 gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
2012 gr_fecs_new_ctx_target_m() |
2013 gr_fecs_new_ctx_valid_m());
2015 gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
2016 gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
2017 gr_fecs_arb_ctx_ptr_target_m());
2019 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
2021 /* Wait for arbiter command to complete */
2023 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2024 while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2027 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2030 gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
2032 gk20a_writel(g, gr_fecs_current_ctx_r(),
2033 gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
2034 gr_fecs_current_ctx_target_m() |
2035 gr_fecs_current_ctx_valid_m());
2036 /* Send command to arbiter to flush */
2037 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
2040 val = (gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
2041 while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2044 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2047 gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
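/*
 * Bootstrap sequence in the routine below. It drives one falcon, selected by
 * reg_offset (0 for FECS, the FECS-to-GPCCS register delta for GPCCS): a
 * ten-word bootloader header is written into DMEM through the
 * auto-incrementing port, the boot image is DMAed into IMEM in 256-byte
 * blocks, the boot vector is set and the falcon CPU is started.
 * Reading the writes below, the header words are, in order:
 *   0, code base (FB offset >> 8), 0, code size, 0,
 *   data base (FB offset >> 8), data size, code base, 0, 0
 * The meaning of each word is defined by the bootloader itself, so treat the
 * list above as a transcript of the writes rather than a specification.
 * The block count rounds the boot image up to whole 256-byte blocks, e.g. a
 * 600-byte image gives ((600 + 0xFF) & ~0xFF) >> 8 = 3 blocks.
 */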
2050 static int gr_gk20a_load_ctxsw_ucode_segments(struct gk20a *g, u64 addr_base,
2051 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2060 addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
2061 addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
2062 addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
2064 gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
2065 gr_fecs_dmactl_require_ctx_f(0));
2068 * Copy falcon bootloader header into dmem at offset 0.
2069 * Configure dmem port 0 for auto-incrementing writes starting at dmem offset 0.
2072 gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2073 gr_fecs_dmemc_offs_f(0) |
2074 gr_fecs_dmemc_blk_f(0) |
2075 gr_fecs_dmemc_aincw_f(1));
2077 /* Write out the actual data */
2078 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2079 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2080 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2081 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->code.size);
2082 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2083 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32);
2084 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->data.size);
2085 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2086 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2087 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2089 blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;
2092 * Set the base FB address for the DMA transfer. Subtract off the 256
2093 * byte IMEM block offset such that the relative FB and IMEM offsets
2094 * match, allowing the IMEM tags to be properly created.
2097 dst = segments->boot_imem_offset;
2098 gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2099 (addr_load32 - (dst >> 8)));
2101 for (b = 0; b < blocks; b++) {
2102 /* Setup destination IMEM offset */
2103 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2106 /* Setup source offset (relative to BASE) */
2107 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2110 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2111 gr_fecs_dmatrfcmd_imem_f(0x01) |
2112 gr_fecs_dmatrfcmd_write_f(0x00) |
2113 gr_fecs_dmatrfcmd_size_f(0x06) |
2114 gr_fecs_dmatrfcmd_ctxdma_f(0));
2117 /* Specify the falcon boot vector */
2118 gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2119 gr_fecs_bootvec_vec_f(segments->boot_entry));
2121 /* Write to CPUCTL to start the falcon */
2122 gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
2123 gr_fecs_cpuctl_startcpu_f(0x01));
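/*
 * The wrapper below clears ctxsw mailbox 0, binds the ucode instance block,
 * and then bootstraps both falcons with the routine above: FECS at register
 * offset 0 and GPCCS at the offset given by the difference between the GPCCS
 * and FECS hwcfg register addresses.
 */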
2128 static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2130 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2131 u64 addr_base = ucode_info->ucode_gpuva;
2133 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2135 gr_gk20a_load_falcon_bind_instblk(g);
2137 gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
2138 &g->ctxsw_ucode_info.fecs, 0);
2140 gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
2141 &g->ctxsw_ucode_info.gpccs,
2142 gr_gpcs_gpccs_falcon_hwcfg_r() -
2143 gr_fecs_falcon_hwcfg_r());
2146 static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
2152 if (tegra_platform_is_linsim()) {
2153 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2154 gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2155 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2156 gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2160 * In case the gPMU falcon is not being used, revert to the old way of
2161 * loading gr ucode, without the faster bootstrap routine.
2163 if (!support_gk20a_pmu()) {
2164 gr_gk20a_load_falcon_dmem(g);
2165 gr_gk20a_load_falcon_imem(g);
2166 gr_gk20a_start_falcon_ucode(g);
2168 if (!gr->skip_ucode_init)
2169 gr_gk20a_init_ctxsw_ucode(g);
2170 gr_gk20a_load_falcon_with_bootloader(g);
2171 gr->skip_ucode_init = true;
2174 ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
2175 GR_IS_UCODE_OP_EQUAL,
2176 eUcodeHandshakeInitComplete,
2177 GR_IS_UCODE_OP_SKIP, 0);
2179 gk20a_err(dev_from_gk20a(g), "falcon ucode init timeout");
2183 if (support_gk20a_pmu())
2184 gk20a_writel(g, gr_fecs_current_ctx_r(),
2185 gr_fecs_current_ctx_valid_false_f());
2187 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2188 gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2189 gk20a_writel(g, gr_fecs_method_push_r(),
2190 gr_fecs_method_push_adr_set_watchdog_timeout_f());
2192 gk20a_dbg_fn("done");
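/*
 * gr_gk20a_init_ctx_state() below asks FECS, through the "discover image
 * size" methods, for the golden, zcull and pm context image sizes. The first
 * two are cached in gr.ctx_vars; the BUG_ONs further down suggest they are
 * expected not to change across railgating.
 */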
2196 static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
2198 u32 golden_ctx_image_size = 0;
2199 u32 zcull_ctx_image_size = 0;
2200 u32 pm_ctx_image_size = 0;
2202 struct fecs_method_op_gk20a op = {
2203 .mailbox = { .id = 0, .data = 0,
2204 .clr = ~0, .ok = 0, .fail = 0},
2206 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2207 .cond.fail = GR_IS_UCODE_OP_SKIP,
2211 op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
2212 op.mailbox.ret = &golden_ctx_image_size;
2213 ret = gr_gk20a_submit_fecs_method_op(g, op);
2215 gk20a_err(dev_from_gk20a(g),
2216 "query golden image size failed");
2219 op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
2220 op.mailbox.ret = &zcull_ctx_image_size;
2221 ret = gr_gk20a_submit_fecs_method_op(g, op);
2223 gk20a_err(dev_from_gk20a(g),
2224 "query zcull ctx image size failed");
2227 op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
2228 op.mailbox.ret = &pm_ctx_image_size;
2229 ret = gr_gk20a_submit_fecs_method_op(g, op);
2231 gk20a_err(dev_from_gk20a(g),
2232 "query pm ctx image size failed");
2236 if (!g->gr.ctx_vars.golden_image_size &&
2237 !g->gr.ctx_vars.zcull_ctxsw_image_size) {
2238 g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
2239 g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
2241 /* the sizes must not change across railgating; the BUG_ONs below catch hw reporting otherwise */
2242 BUG_ON(g->gr.ctx_vars.golden_image_size != golden_ctx_image_size);
2243 BUG_ON(g->gr.ctx_vars.zcull_ctxsw_image_size != zcull_ctx_image_size);
2246 g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2248 gk20a_dbg_fn("done");
2252 static void gk20a_gr_destroy_ctx_buffer(struct platform_device *pdev,
2253 struct gr_ctx_buffer_desc *desc)
2255 struct device *dev = &pdev->dev;
2256 gk20a_free_sgtable(&desc->sgt);
2257 dma_free_attrs(dev, desc->size, desc->pages,
2258 desc->iova, &desc->attrs);
2261 static int gk20a_gr_alloc_ctx_buffer(struct platform_device *pdev,
2262 struct gr_ctx_buffer_desc *desc,
2265 struct device *dev = &pdev->dev;
2266 DEFINE_DMA_ATTRS(attrs);
2270 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2272 desc->pages = dma_alloc_attrs(&pdev->dev, size, &iova,
2273 GFP_KERNEL, &attrs);
2279 desc->attrs = attrs;
2280 desc->destroy = gk20a_gr_destroy_ctx_buffer;
2281 err = gk20a_get_sgtable_from_pages(&pdev->dev, &desc->sgt, desc->pages,
2282 desc->iova, desc->size);
2284 dma_free_attrs(dev, desc->size, desc->pages,
2285 desc->iova, &desc->attrs);
2286 memset(desc, 0, sizeof(*desc));
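/*
 * The allocator below creates the global context buffers: circular (bundle)
 * buffer, pagepool, attribute buffer, golden context image and the priv
 * register access map. When the platform provides a secure_alloc hook, VPR
 * copies of the first three are allocated as well. On failure, every buffer
 * that has a destroy callback is torn down again.
 */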
2292 static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2294 struct gk20a_platform *platform = platform_get_drvdata(g->dev);
2295 struct gr_gk20a *gr = &g->gr;
2296 int i, attr_buffer_size, err;
2297 struct platform_device *pdev = g->dev;
2299 u32 cb_buffer_size = gr->bundle_cb_default_size *
2300 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2302 u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
2303 gr_scc_pagepool_total_pages_byte_granularity_v();
2307 attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);
2309 gk20a_dbg_info("cb_buffer_size : %d", cb_buffer_size);
2311 err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[CIRCULAR],
2316 if (platform->secure_alloc)
2317 platform->secure_alloc(pdev,
2318 &gr->global_ctx_buffer[CIRCULAR_VPR],
2321 gk20a_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
2323 err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[PAGEPOOL],
2324 pagepool_buffer_size);
2328 if (platform->secure_alloc)
2329 platform->secure_alloc(pdev,
2330 &gr->global_ctx_buffer[PAGEPOOL_VPR],
2331 pagepool_buffer_size);
2333 gk20a_dbg_info("attr_buffer_size : %d", attr_buffer_size);
2335 err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[ATTRIBUTE],
2340 if (platform->secure_alloc)
2341 platform->secure_alloc(pdev,
2342 &gr->global_ctx_buffer[ATTRIBUTE_VPR],
2345 if (platform->secure_buffer.destroy)
2346 platform->secure_buffer.destroy(pdev, &platform->secure_buffer);
2348 gk20a_dbg_info("golden_image_size : %d",
2349 gr->ctx_vars.golden_image_size);
2351 err = gk20a_gr_alloc_ctx_buffer(pdev,
2352 &gr->global_ctx_buffer[GOLDEN_CTX],
2353 gr->ctx_vars.golden_image_size);
2357 gk20a_dbg_info("priv_access_map_size : %d",
2358 gr->ctx_vars.priv_access_map_size);
2360 err = gk20a_gr_alloc_ctx_buffer(pdev,
2361 &gr->global_ctx_buffer[PRIV_ACCESS_MAP],
2362 gr->ctx_vars.priv_access_map_size);
2367 gk20a_dbg_fn("done");
2371 gk20a_err(dev_from_gk20a(g), "fail");
2372 for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2373 if (gr->global_ctx_buffer[i].destroy) {
2374 gr->global_ctx_buffer[i].destroy(pdev,
2375 &gr->global_ctx_buffer[i]);
2381 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2383 struct platform_device *pdev = g->dev;
2384 struct gr_gk20a *gr = &g->gr;
2385 DEFINE_DMA_ATTRS(attrs);
2388 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2390 for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2391 gr->global_ctx_buffer[i].destroy(pdev,
2392 &gr->global_ctx_buffer[i]);
2395 gk20a_dbg_fn("done");
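/*
 * Mapping below is per channel. For the circular, attribute and pagepool
 * buffers the VPR copy is used when the channel is a VPR channel and the
 * copy exists; otherwise the normal copy is mapped. The resulting GPU VAs
 * and sizes are recorded in the channel context so they can be committed and
 * later unmapped.
 */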
2398 static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2399 struct channel_gk20a *c)
2401 struct vm_gk20a *ch_vm = c->vm;
2402 u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2403 u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2404 struct gr_gk20a *gr = &g->gr;
2405 struct sg_table *sgt;
2411 /* Circular Buffer */
2412 if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].sgt == NULL)) {
2413 sgt = gr->global_ctx_buffer[CIRCULAR].sgt;
2414 size = gr->global_ctx_buffer[CIRCULAR].size;
2416 sgt = gr->global_ctx_buffer[CIRCULAR_VPR].sgt;
2417 size = gr->global_ctx_buffer[CIRCULAR_VPR].size;
2420 gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2421 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2422 gk20a_mem_flag_none);
2425 g_bfr_va[CIRCULAR_VA] = gpu_va;
2426 g_bfr_size[CIRCULAR_VA] = size;
2428 /* Attribute Buffer */
2429 if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt == NULL)) {
2430 sgt = gr->global_ctx_buffer[ATTRIBUTE].sgt;
2431 size = gr->global_ctx_buffer[ATTRIBUTE].size;
2433 sgt = gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt;
2434 size = gr->global_ctx_buffer[ATTRIBUTE_VPR].size;
2437 gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2438 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2439 gk20a_mem_flag_none);
2442 g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2443 g_bfr_size[ATTRIBUTE_VA] = size;
2446 if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].sgt == NULL)) {
2447 sgt = gr->global_ctx_buffer[PAGEPOOL].sgt;
2448 size = gr->global_ctx_buffer[PAGEPOOL].size;
2450 sgt = gr->global_ctx_buffer[PAGEPOOL_VPR].sgt;
2451 size = gr->global_ctx_buffer[PAGEPOOL_VPR].size;
2454 gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2455 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2456 gk20a_mem_flag_none);
2459 g_bfr_va[PAGEPOOL_VA] = gpu_va;
2460 g_bfr_size[PAGEPOOL_VA] = size;
2463 sgt = gr->global_ctx_buffer[GOLDEN_CTX].sgt;
2464 size = gr->global_ctx_buffer[GOLDEN_CTX].size;
2465 gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
2466 gk20a_mem_flag_none);
2469 g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2470 g_bfr_size[GOLDEN_CTX_VA] = size;
2472 /* Priv register Access Map */
2473 sgt = gr->global_ctx_buffer[PRIV_ACCESS_MAP].sgt;
2474 size = gr->global_ctx_buffer[PRIV_ACCESS_MAP].size;
2475 gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
2476 gk20a_mem_flag_none);
2479 g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2480 g_bfr_size[PRIV_ACCESS_MAP_VA] = size;
2482 c->ch_ctx.global_ctx_buffer_mapped = true;
2486 for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2488 gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
2489 gr->global_ctx_buffer[i].size,
2490 gk20a_mem_flag_none);
2497 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
2499 struct vm_gk20a *ch_vm = c->vm;
2500 u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2501 u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2506 for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2508 gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
2510 gk20a_mem_flag_none);
2515 c->ch_ctx.global_ctx_buffer_mapped = false;
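/*
 * The per-channel GR context buffer below is sized from the golden image,
 * allocated with DMA_ATTR_NO_KERNEL_MAPPING (pages only, no kernel VA),
 * wrapped in a scatterlist table and mapped cacheable into the channel VM;
 * the sgtable is freed once the GMMU mapping exists.
 */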
2518 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
2519 struct channel_gk20a *c)
2521 struct gr_gk20a *gr = &g->gr;
2522 struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
2523 struct vm_gk20a *ch_vm = c->vm;
2524 struct device *d = dev_from_gk20a(g);
2525 struct sg_table *sgt;
2526 DEFINE_DMA_ATTRS(attrs);
2532 if (gr->ctx_vars.buffer_size == 0)
2535 /* alloc channel gr ctx buffer */
2536 gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2537 gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2539 gr_ctx->size = gr->ctx_vars.buffer_total_size;
2540 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2541 gr_ctx->pages = dma_alloc_attrs(d, gr_ctx->size,
2542 &iova, GFP_KERNEL, &attrs);
2546 gr_ctx->iova = iova;
2547 err = gk20a_get_sgtable_from_pages(d, &sgt, gr_ctx->pages,
2548 gr_ctx->iova, gr_ctx->size);
2552 gr_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, gr_ctx->size,
2553 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2554 gk20a_mem_flag_none);
2555 if (!gr_ctx->gpu_va)
2558 gk20a_free_sgtable(&sgt);
2563 gk20a_free_sgtable(&sgt);
2565 dma_free_attrs(d, gr_ctx->size,
2566 gr_ctx->pages, gr_ctx->iova, &attrs);
2567 gr_ctx->pages = NULL;
2573 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
2575 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2576 struct vm_gk20a *ch_vm = c->vm;
2577 struct gk20a *g = c->g;
2578 struct device *d = dev_from_gk20a(g);
2579 DEFINE_DMA_ATTRS(attrs);
2583 if (!ch_ctx->gr_ctx.gpu_va)
2586 gk20a_gmmu_unmap(ch_vm, ch_ctx->gr_ctx.gpu_va,
2587 ch_ctx->gr_ctx.size, gk20a_mem_flag_none);
2588 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2589 dma_free_attrs(d, ch_ctx->gr_ctx.size,
2590 ch_ctx->gr_ctx.pages, ch_ctx->gr_ctx.iova, &attrs);
2591 ch_ctx->gr_ctx.pages = NULL;
2592 ch_ctx->gr_ctx.iova = 0;
2595 static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
2596 struct channel_gk20a *c)
2598 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2599 struct device *d = dev_from_gk20a(g);
2600 struct vm_gk20a *ch_vm = c->vm;
2601 DEFINE_DMA_ATTRS(attrs);
2602 struct sg_table *sgt;
2608 patch_ctx->size = 128 * sizeof(u32);
2609 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2610 patch_ctx->pages = dma_alloc_attrs(d, patch_ctx->size,
2613 if (!patch_ctx->pages)
2616 patch_ctx->iova = iova;
2617 err = gk20a_get_sgtable_from_pages(d, &sgt, patch_ctx->pages,
2618 patch_ctx->iova, patch_ctx->size);
2622 patch_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, patch_ctx->size,
2623 0, gk20a_mem_flag_none);
2624 if (!patch_ctx->gpu_va)
2625 goto err_free_sgtable;
2627 gk20a_free_sgtable(&sgt);
2629 gk20a_dbg_fn("done");
2633 gk20a_free_sgtable(&sgt);
2635 dma_free_attrs(d, patch_ctx->size,
2636 patch_ctx->pages, patch_ctx->iova, &attrs);
2637 patch_ctx->pages = NULL;
2638 patch_ctx->iova = 0;
2639 gk20a_err(dev_from_gk20a(g), "fail");
2643 static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
2645 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2646 struct vm_gk20a *ch_vm = c->vm;
2650 if (patch_ctx->gpu_va)
2651 gk20a_gmmu_unmap(ch_vm, patch_ctx->gpu_va,
2652 patch_ctx->size, gk20a_mem_flag_none);
2653 patch_ctx->gpu_va = 0;
2654 patch_ctx->data_count = 0;
2657 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
2659 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2660 struct gk20a *g = c->g;
2661 struct device *d = dev_from_gk20a(g);
2662 DEFINE_DMA_ATTRS(attrs);
2666 gr_gk20a_unmap_channel_patch_ctx(c);
2668 if (patch_ctx->pages) {
2669 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2670 dma_free_attrs(d, patch_ctx->size,
2671 patch_ctx->pages, patch_ctx->iova, &attrs);
2672 patch_ctx->pages = NULL;
2673 patch_ctx->iova = 0;
2677 void gk20a_free_channel_ctx(struct channel_gk20a *c)
2679 gr_gk20a_unmap_global_ctx_buffers(c);
2680 gr_gk20a_free_channel_patch_ctx(c);
2681 gr_gk20a_free_channel_gr_ctx(c);
2683 /* zcull_ctx, pm_ctx */
2685 memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
2688 c->first_init = false;
2691 static bool gr_gk20a_is_valid_class(struct gk20a *g, u32 class_num)
2695 switch (class_num) {
2696 case KEPLER_COMPUTE_A:
2699 case KEPLER_DMA_COPY_A:
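/*
 * gk20a_alloc_obj_ctx() below performs the per-channel setup for a new
 * object class: it requires a bound address space and a valid class,
 * allocates and commits the GR context buffer, allocates the patch context,
 * maps and commits the global buffers, applies the texlock tweak for
 * KEPLER_COMPUTE_A through patch writes, and finally initializes the golden
 * context image and loads it into the channel on its first allocation.
 */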
2710 int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
2711 struct nvhost_alloc_obj_ctx_args *args)
2713 struct gk20a *g = c->g;
2714 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2719 /* an address space needs to have been bound at this point. */
2720 if (!gk20a_channel_as_bound(c)) {
2721 gk20a_err(dev_from_gk20a(g),
2722 "not bound to address space at time"
2723 " of grctx allocation");
2727 if (!g->ops.gr.is_valid_class(g, args->class_num)) {
2728 gk20a_err(dev_from_gk20a(g),
2729 "invalid obj class 0x%x", args->class_num);
2734 /* allocate gr ctx buffer */
2735 if (ch_ctx->gr_ctx.pages == NULL) {
2736 err = gr_gk20a_alloc_channel_gr_ctx(g, c);
2738 gk20a_err(dev_from_gk20a(g),
2739 "fail to allocate gr ctx buffer");
2742 c->obj_class = args->class_num;
2744 /* TBD: be more precise about which class is being allocated,
2745 * as some classes are allowed to be allocated on the same channel */
2746 gk20a_err(dev_from_gk20a(g),
2747 "too many classes alloc'd on same channel");
2752 /* commit gr ctx buffer */
2753 err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
2755 gk20a_err(dev_from_gk20a(g),
2756 "fail to commit gr ctx buffer");
2760 /* allocate patch buffer */
2761 if (ch_ctx->patch_ctx.pages == NULL) {
2762 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
2764 gk20a_err(dev_from_gk20a(g),
2765 "fail to allocate patch buffer");
2770 /* map global buffer to channel gpu_va and commit */
2771 if (!ch_ctx->global_ctx_buffer_mapped) {
2772 err = gr_gk20a_map_global_ctx_buffers(g, c);
2774 gk20a_err(dev_from_gk20a(g),
2775 "fail to map global ctx buffer");
2778 gr_gk20a_elpg_protected_call(g,
2779 gr_gk20a_commit_global_ctx_buffers(g, c, true));
2782 /* tweak any perf parameters per-context here */
2783 if (args->class_num == KEPLER_COMPUTE_A) {
2785 u32 tex_lock_disable_mask =
2786 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_m() |
2787 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_m() |
2788 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_m() |
2789 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_m() |
2790 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_m() |
2791 gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_m();
2793 u32 texlock = gk20a_readl(g, gr_gpcs_tpcs_sm_sch_texlock_r());
2795 texlock = (texlock & ~tex_lock_disable_mask) |
2796 (gr_gpcs_tpcs_sm_sch_texlock_tex_hash_disable_f() |
2797 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_disable_f() |
2798 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_disable_f() |
2799 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_disable_f() |
2800 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_disable_f() |
2801 gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_disable_f());
2803 begin_err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
2806 err = gr_gk20a_ctx_patch_write(g, ch_ctx,
2807 gr_gpcs_tpcs_sm_sch_texlock_r(),
2810 if ((begin_err || err)) {
2811 gk20a_err(dev_from_gk20a(g),
2812 "failed to set texlock for compute class");
2815 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
2818 /* init golden image, ELPG enabled after this is done */
2819 err = gr_gk20a_init_golden_ctx_image(g, c);
2821 gk20a_err(dev_from_gk20a(g),
2822 "fail to init golden ctx image");
2826 /* load golden image */
2827 if (!c->first_init) {
2828 err = gr_gk20a_elpg_protected_call(g,
2829 gr_gk20a_load_golden_ctx_image(g, c));
2831 gk20a_err(dev_from_gk20a(g),
2832 "fail to load golden ctx image");
2835 c->first_init = true;
2840 gk20a_dbg_fn("done");
2843 /* 1. gr_ctx, patch_ctx and the global ctx buffer mappings
2844 can be reused, so there is no need to release them.
2845 2. golden image init and load happen only once, so if
2846 they passed, there is nothing to undo. */
2847 gk20a_err(dev_from_gk20a(g), "fail");
2851 int gk20a_free_obj_ctx(struct channel_gk20a *c,
2852 struct nvhost_free_obj_ctx_args *args)
2854 unsigned long timeout = gk20a_get_gr_idle_timeout(c->g);
2858 if (c->num_objects == 0)
2863 if (c->num_objects == 0) {
2864 c->first_init = false;
2865 gk20a_disable_channel(c,
2868 gr_gk20a_unmap_channel_patch_ctx(c);
2874 static void gk20a_remove_gr_support(struct gr_gk20a *gr)
2876 struct gk20a *g = gr->g;
2877 struct device *d = dev_from_gk20a(g);
2878 DEFINE_DMA_ATTRS(attrs);
2882 gr_gk20a_free_global_ctx_buffers(g);
2884 dma_free_coherent(d, gr->mmu_wr_mem.size,
2885 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
2886 gr->mmu_wr_mem.cpuva = NULL;
2887 gr->mmu_wr_mem.iova = 0;
2888 dma_free_coherent(d, gr->mmu_rd_mem.size,
2889 gr->mmu_rd_mem.cpuva, gr->mmu_rd_mem.iova);
2890 gr->mmu_rd_mem.cpuva = NULL;
2891 gr->mmu_rd_mem.iova = 0;
2893 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2894 dma_free_attrs(d, gr->compbit_store.size, gr->compbit_store.pages,
2895 gr->compbit_store.base_iova, &attrs);
2897 memset(&gr->mmu_wr_mem, 0, sizeof(struct mmu_desc));
2898 memset(&gr->mmu_rd_mem, 0, sizeof(struct mmu_desc));
2899 memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
2901 kfree(gr->gpc_tpc_count);
2902 kfree(gr->gpc_zcb_count);
2903 kfree(gr->gpc_ppc_count);
2904 kfree(gr->pes_tpc_count[0]);
2905 kfree(gr->pes_tpc_count[1]);
2906 kfree(gr->pes_tpc_mask[0]);
2907 kfree(gr->pes_tpc_mask[1]);
2908 kfree(gr->gpc_skip_mask);
2909 kfree(gr->map_tiles);
2910 gr->gpc_tpc_count = NULL;
2911 gr->gpc_zcb_count = NULL;
2912 gr->gpc_ppc_count = NULL;
2913 gr->pes_tpc_count[0] = NULL;
2914 gr->pes_tpc_count[1] = NULL;
2915 gr->pes_tpc_mask[0] = NULL;
2916 gr->pes_tpc_mask[1] = NULL;
2917 gr->gpc_skip_mask = NULL;
2918 gr->map_tiles = NULL;
2920 kfree(gr->ctx_vars.ucode.fecs.inst.l);
2921 kfree(gr->ctx_vars.ucode.fecs.data.l);
2922 kfree(gr->ctx_vars.ucode.gpccs.inst.l);
2923 kfree(gr->ctx_vars.ucode.gpccs.data.l);
2924 kfree(gr->ctx_vars.sw_bundle_init.l);
2925 kfree(gr->ctx_vars.sw_method_init.l);
2926 kfree(gr->ctx_vars.sw_ctx_load.l);
2927 kfree(gr->ctx_vars.sw_non_ctx_load.l);
2928 kfree(gr->ctx_vars.ctxsw_regs.sys.l);
2929 kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
2930 kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
2931 kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
2932 kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
2933 kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
2934 kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
2935 kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
2937 kfree(gr->ctx_vars.local_golden_image);
2938 gr->ctx_vars.local_golden_image = NULL;
2940 gk20a_allocator_destroy(&gr->comp_tags);
2943 static void gr_gk20a_bundle_cb_defaults(struct gk20a *g)
2945 struct gr_gk20a *gr = &g->gr;
2947 gr->bundle_cb_default_size =
2948 gr_scc_bundle_cb_size_div_256b__prod_v();
2949 gr->min_gpm_fifo_depth =
2950 gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
2951 gr->bundle_cb_token_limit =
2952 gr_pd_ab_dist_cfg2_token_limit_init_v();
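/*
 * gr_gk20a_init_gr_config() below reads the chip topology from the priv
 * ringmaster and top registers (FBP/GPC/TPC counts and per-PES TPC masks),
 * derives per-GPC skip masks for floorswept configurations, and then lets
 * the per-chip ops fill in the circular buffer defaults.
 */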
2955 static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
2957 u32 gpc_index, pes_index;
2960 u32 pes_heavy_index;
2961 u32 gpc_new_skip_mask;
2964 tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
2965 gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
2967 tmp = gk20a_readl(g, top_num_gpcs_r());
2968 gr->max_gpc_count = top_num_gpcs_value_v(tmp);
2970 tmp = gk20a_readl(g, top_num_fbps_r());
2971 gr->max_fbps_count = top_num_fbps_value_v(tmp);
2973 tmp = gk20a_readl(g, top_tpc_per_gpc_r());
2974 gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
2976 gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
2978 tmp = gk20a_readl(g, top_num_fbps_r());
2979 gr->sys_count = top_num_fbps_value_v(tmp);
2981 tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
2982 gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
2984 gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
2985 gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
2987 if (!gr->gpc_count) {
2988 gk20a_err(dev_from_gk20a(g), "gpc_count==0!");
2992 gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2993 gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2994 gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2995 gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2996 gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2997 gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2998 gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
3000 kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
3003 if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
3004 !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
3005 !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
3009 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3010 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
3012 gr->gpc_tpc_count[gpc_index] =
3013 gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
3014 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
3016 gr->gpc_zcb_count[gpc_index] =
3017 gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
3018 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
3020 gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
3021 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
3022 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
3024 tmp = gk20a_readl(g,
3025 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
3026 gpc_index * proj_gpc_stride_v());
3028 pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
3029 pes_tpc_count = count_bits(pes_tpc_mask);
3031 gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
3032 gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
3035 gpc_new_skip_mask = 0;
3036 if (gr->pes_tpc_count[0][gpc_index] +
3037 gr->pes_tpc_count[1][gpc_index] == 5) {
3039 gr->pes_tpc_count[0][gpc_index] >
3040 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3043 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3044 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3045 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3047 } else if ((gr->pes_tpc_count[0][gpc_index] +
3048 gr->pes_tpc_count[1][gpc_index] == 4) &&
3049 (gr->pes_tpc_count[0][gpc_index] !=
3050 gr->pes_tpc_count[1][gpc_index])) {
3052 gr->pes_tpc_count[0][gpc_index] >
3053 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3056 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3057 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3058 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3060 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
3063 gk20a_dbg_info("fbps: %d", gr->num_fbps);
3064 gk20a_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
3065 gk20a_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
3066 gk20a_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
3067 gk20a_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
3068 gk20a_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
3069 gk20a_dbg_info("sys_count: %d", gr->sys_count);
3070 gk20a_dbg_info("gpc_count: %d", gr->gpc_count);
3071 gk20a_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
3072 gk20a_dbg_info("tpc_count: %d", gr->tpc_count);
3073 gk20a_dbg_info("ppc_count: %d", gr->ppc_count);
3075 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3076 gk20a_dbg_info("gpc_tpc_count[%d] : %d",
3077 gpc_index, gr->gpc_tpc_count[gpc_index]);
3078 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3079 gk20a_dbg_info("gpc_zcb_count[%d] : %d",
3080 gpc_index, gr->gpc_zcb_count[gpc_index]);
3081 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3082 gk20a_dbg_info("gpc_ppc_count[%d] : %d",
3083 gpc_index, gr->gpc_ppc_count[gpc_index]);
3084 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3085 gk20a_dbg_info("gpc_skip_mask[%d] : %d",
3086 gpc_index, gr->gpc_skip_mask[gpc_index]);
3087 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3089 pes_index < gr->pe_count_per_gpc;
3091 gk20a_dbg_info("pes_tpc_count[%d][%d] : %d",
3092 pes_index, gpc_index,
3093 gr->pes_tpc_count[pes_index][gpc_index]);
3095 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3097 pes_index < gr->pe_count_per_gpc;
3099 gk20a_dbg_info("pes_tpc_mask[%d][%d] : %d",
3100 pes_index, gpc_index,
3101 gr->pes_tpc_mask[pes_index][gpc_index]);
3103 g->ops.gr.bundle_cb_defaults(g);
3104 g->ops.gr.cb_size_default(g);
3105 g->ops.gr.calc_global_ctx_buffer_size(g);
3106 gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
3108 gk20a_dbg_info("bundle_cb_default_size: %d",
3109 gr->bundle_cb_default_size);
3110 gk20a_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
3111 gk20a_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
3112 gk20a_dbg_info("attrib_cb_default_size: %d",
3113 gr->attrib_cb_default_size);
3114 gk20a_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
3115 gk20a_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
3116 gk20a_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
3117 gk20a_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
3125 static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
3127 struct device *d = dev_from_gk20a(g);
3130 gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
3132 gr->mmu_wr_mem.size = gr->mmu_wr_mem_size;
3133 gr->mmu_wr_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_wr_mem_size,
3135 if (!gr->mmu_wr_mem.cpuva)
3138 gr->mmu_wr_mem.iova = iova;
3140 gr->mmu_rd_mem.size = gr->mmu_rd_mem_size;
3141 gr->mmu_rd_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_rd_mem_size,
3143 if (!gr->mmu_rd_mem.cpuva)
3144 goto err_free_wr_mem;
3146 gr->mmu_rd_mem.iova = iova;
3150 dma_free_coherent(d, gr->mmu_wr_mem.size,
3151 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
3152 gr->mmu_wr_mem.cpuva = NULL;
3153 gr->mmu_wr_mem.iova = 0;
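/*
 * The map-tiles setup below picks gr->map_row_offset as, roughly, the
 * smallest prime from prime_set[] that does not divide tpc_count, with a few
 * hard-coded special cases, and then builds map_tiles[], a tile-to-GPC map.
 * GPCs are sorted by TPC count in descending order and tiles are handed out
 * with an error-accumulation loop (run_err against comm_denom), which
 * spreads the busier GPCs evenly across the tile map.
 */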
3158 static u32 prime_set[18] = {
3159 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3161 static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3165 s32 *init_frac = NULL;
3166 s32 *init_err = NULL;
3167 s32 *run_err = NULL;
3168 s32 *sorted_num_tpcs = NULL;
3169 s32 *sorted_to_unsorted_gpc_map = NULL;
3173 u32 max_tpc_count = 0;
3177 bool delete_map = false;
3181 init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3182 init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3183 run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3185 kzalloc(proj_scal_max_gpcs_v() *
3186 proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
3188 sorted_to_unsorted_gpc_map =
3189 kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3191 if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
3192 sorted_to_unsorted_gpc_map)) {
3197 gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3199 if (gr->tpc_count == 3)
3200 gr->map_row_offset = 2;
3201 else if (gr->tpc_count < 3)
3202 gr->map_row_offset = 1;
3204 gr->map_row_offset = 3;
3206 for (index = 1; index < 18; index++) {
3207 u32 prime = prime_set[index];
3208 if ((gr->tpc_count % prime) != 0) {
3209 gr->map_row_offset = prime;
3215 switch (gr->tpc_count) {
3217 gr->map_row_offset = 6;
3220 gr->map_row_offset = 5;
3223 gr->map_row_offset = 2;
3226 gr->map_row_offset = 7;
3229 gr->map_row_offset = 6;
3233 gr->map_row_offset = 1;
3239 if (gr->map_tiles) {
3240 if (gr->map_tile_count != gr->tpc_count)
3243 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3244 if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
3249 kfree(gr->map_tiles);
3250 gr->map_tiles = NULL;
3251 gr->map_tile_count = 0;
3255 if (gr->map_tiles == NULL) {
3256 gr->map_tile_count = proj_scal_max_gpcs_v();
3258 gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
3259 if (gr->map_tiles == NULL) {
3264 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3265 sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3266 sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3270 while (!gpc_sorted) {
3272 for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3273 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3275 swap = sorted_num_tpcs[gpc_index];
3276 sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3277 sorted_num_tpcs[gpc_index + 1] = swap;
3278 swap = sorted_to_unsorted_gpc_map[gpc_index];
3279 sorted_to_unsorted_gpc_map[gpc_index] =
3280 sorted_to_unsorted_gpc_map[gpc_index + 1];
3281 sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3286 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3287 if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
3288 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3290 mul_factor = gr->gpc_count * max_tpc_count;
3291 if (mul_factor & 0x1)
3296 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3298 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3299 num_tpc = sorted_num_tpcs[gpc_index];
3301 init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3304 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3306 init_err[gpc_index] = 0;
3308 run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3311 while (gpc_mark < gr->tpc_count) {
3312 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3313 if ((run_err[gpc_index] * 2) >= comm_denom) {
3314 gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3315 run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3317 run_err[gpc_index] += init_frac[gpc_index];
3326 kfree(sorted_num_tpcs);
3327 kfree(sorted_to_unsorted_gpc_map);
3330 gk20a_err(dev_from_gk20a(g), "fail");
3332 gk20a_dbg_fn("done");
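/*
 * zcull software state below is derived from the TPC count: aliquots are
 * tpc_count * 16 pixels wide and 16 pixels high, and the
 * pixel-squares-per-aliquot value assumes no floorsweeping, as the comment
 * in the function notes. The total aliquot count is read back from
 * gr_gpc0_zcull_total_ram_size_r().
 */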
3337 static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3339 struct gr_zcull_gk20a *zcull = &gr->zcull;
3341 zcull->aliquot_width = gr->tpc_count * 16;
3342 zcull->aliquot_height = 16;
3344 zcull->width_align_pixels = gr->tpc_count * 16;
3345 zcull->height_align_pixels = 32;
3347 zcull->aliquot_size =
3348 zcull->aliquot_width * zcull->aliquot_height;
3350 /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3351 zcull->pixel_squares_by_aliquots =
3352 gr->zcb_count * 16 * 16 * gr->tpc_count /
3353 (gr->gpc_count * gr->gpc_tpc_count[0]);
3355 zcull->total_aliquots =
3356 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3357 gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3362 u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3364 /* assuming gr has already been initialized */
3365 return gr->ctx_vars.zcull_ctxsw_image_size;
3368 int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3369 struct channel_gk20a *c, u64 zcull_va, u32 mode)
3371 struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
3373 zcull_ctx->ctx_sw_mode = mode;
3374 zcull_ctx->gpu_va = zcull_va;
3376 /* TBD: don't disable channel in sw method processing */
3377 return gr_gk20a_ctx_zcull_setup(g, c, true);
3380 int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3381 struct gr_zcull_info *zcull_params)
3383 struct gr_zcull_gk20a *zcull = &gr->zcull;
3385 zcull_params->width_align_pixels = zcull->width_align_pixels;
3386 zcull_params->height_align_pixels = zcull->height_align_pixels;
3387 zcull_params->pixel_squares_by_aliquots =
3388 zcull->pixel_squares_by_aliquots;
3389 zcull_params->aliquot_total = zcull->total_aliquots;
3391 zcull_params->region_byte_multiplier =
3392 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3393 zcull_params->region_header_size =
3394 proj_scal_litter_num_gpcs_v() *
3395 gr_zcull_save_restore_header_bytes_per_gpc_v();
3397 zcull_params->subregion_header_size =
3398 proj_scal_litter_num_gpcs_v() *
3399 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3401 zcull_params->subregion_width_align_pixels =
3402 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3403 zcull_params->subregion_height_align_pixels =
3404 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3405 zcull_params->subregion_count = gr_zcull_subregion_qty_v();
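/*
 * ZBC (zero bandwidth clear) entries are programmed with the GR engine
 * quiesced: engine activity is disabled and the engine idled, the L2 table
 * is updated through the ltc ops, the DS table registers are written and the
 * load is triggered, a local shadow copy is kept for later reloads, and
 * engine activity is re-enabled. The color and depth variants below follow
 * the same pattern.
 */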
3410 static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3411 struct zbc_entry *color_val, u32 index)
3413 struct fifo_gk20a *f = &g->fifo;
3414 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3416 unsigned long end_jiffies = jiffies +
3417 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3420 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3422 gk20a_err(dev_from_gk20a(g),
3423 "failed to disable gr engine activity\n");
3427 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3429 gk20a_err(dev_from_gk20a(g),
3430 "failed to idle graphics\n");
3434 /* update l2 table */
3435 g->ops.ltc.set_zbc_color_entry(g, color_val, index);
3437 /* update ds table */
3438 gk20a_writel(g, gr_ds_zbc_color_r_r(),
3439 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3440 gk20a_writel(g, gr_ds_zbc_color_g_r(),
3441 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3442 gk20a_writel(g, gr_ds_zbc_color_b_r(),
3443 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3444 gk20a_writel(g, gr_ds_zbc_color_a_r(),
3445 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3447 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3448 gr_ds_zbc_color_fmt_val_f(color_val->format));
3450 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3451 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3453 /* trigger the write */
3454 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3455 gr_ds_zbc_tbl_ld_select_c_f() |
3456 gr_ds_zbc_tbl_ld_action_write_f() |
3457 gr_ds_zbc_tbl_ld_trigger_active_f());
3459 /* update local copy */
3460 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3461 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3462 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3464 gr->zbc_col_tbl[index].format = color_val->format;
3465 gr->zbc_col_tbl[index].ref_cnt++;
3468 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3470 gk20a_err(dev_from_gk20a(g),
3471 "failed to enable gr engine activity\n");
3477 static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3478 struct zbc_entry *depth_val, u32 index)
3480 struct fifo_gk20a *f = &g->fifo;
3481 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3482 unsigned long end_jiffies = jiffies +
3483 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3486 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3488 gk20a_err(dev_from_gk20a(g),
3489 "failed to disable gr engine activity\n");
3493 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3495 gk20a_err(dev_from_gk20a(g),
3496 "failed to idle graphics\n");
3500 /* update l2 table */
3501 g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);
3503 /* update ds table */
3504 gk20a_writel(g, gr_ds_zbc_z_r(),
3505 gr_ds_zbc_z_val_f(depth_val->depth));
3507 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3508 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3510 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3511 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3513 /* trigger the write */
3514 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3515 gr_ds_zbc_tbl_ld_select_z_f() |
3516 gr_ds_zbc_tbl_ld_action_write_f() |
3517 gr_ds_zbc_tbl_ld_trigger_active_f());
3519 /* update local copy */
3520 gr->zbc_dep_tbl[index].depth = depth_val->depth;
3521 gr->zbc_dep_tbl[index].format = depth_val->format;
3522 gr->zbc_dep_tbl[index].ref_cnt++;
3525 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3527 gk20a_err(dev_from_gk20a(g),
3528 "failed to enable gr engine activity\n");
3534 void gr_gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
3536 struct fifo_gk20a *f = &g->fifo;
3537 struct fifo_engine_info_gk20a *gr_info =
3538 f->engine_info + ENGINE_GR_GK20A;
3539 unsigned long end_jiffies = jiffies +
3540 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3543 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3545 gk20a_err(dev_from_gk20a(g),
3546 "failed to disable gr engine activity\n");
3550 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3552 gk20a_err(dev_from_gk20a(g),
3553 "failed to idle graphics\n");
3558 gk20a_pmu_save_zbc(g, entries);
3561 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3563 gk20a_err(dev_from_gk20a(g),
3564 "failed to enable gr engine activity\n");
3570 int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3571 struct zbc_entry *zbc_val)
3573 struct zbc_color_table *c_tbl;
3574 struct zbc_depth_table *d_tbl;
3575 u32 i, ret = -ENOMEM;
3579 /* no endian swap ? */
3581 mutex_lock(&gr->zbc_lock);
3582 switch (zbc_val->type) {
3583 case GK20A_ZBC_TYPE_COLOR:
3584 /* search existing tables */
3585 for (i = 0; i < gr->max_used_color_index; i++) {
3587 c_tbl = &gr->zbc_col_tbl[i];
3589 if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
3590 memcmp(c_tbl->color_ds, zbc_val->color_ds,
3591 sizeof(zbc_val->color_ds)) == 0) {
3593 if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3594 sizeof(zbc_val->color_l2))) {
3595 gk20a_err(dev_from_gk20a(g),
3596 "zbc l2 and ds color don't match existing entries");
3608 gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3611 &gr->zbc_col_tbl[gr->max_used_color_index];
3612 WARN_ON(c_tbl->ref_cnt != 0);
3614 ret = gr_gk20a_add_zbc_color(g, gr,
3615 zbc_val, gr->max_used_color_index);
3618 gr->max_used_color_index++;
3621 case GK20A_ZBC_TYPE_DEPTH:
3622 /* search existing tables */
3623 for (i = 0; i < gr->max_used_depth_index; i++) {
3625 d_tbl = &gr->zbc_dep_tbl[i];
3627 if (d_tbl->ref_cnt &&
3628 d_tbl->depth == zbc_val->depth &&
3629 d_tbl->format == zbc_val->format) {
3638 gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
3641 &gr->zbc_dep_tbl[gr->max_used_depth_index];
3642 WARN_ON(d_tbl->ref_cnt != 0);
3644 ret = gr_gk20a_add_zbc_depth(g, gr,
3645 zbc_val, gr->max_used_depth_index);
3648 gr->max_used_depth_index++;
3652 gk20a_err(dev_from_gk20a(g),
3653 "invalid zbc table type %d", zbc_val->type);
3658 if (!added && ret == 0) {
3659 /* update zbc for elpg only when new entry is added */
3660 entries = max(gr->max_used_color_index,
3661 gr->max_used_depth_index);
3662 gr_gk20a_pmu_save_zbc(g, entries);
3666 mutex_unlock(&gr->zbc_lock);
3670 /* get a zbc table entry specified by index
3671 * return table size when type is invalid */
3672 int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
3673 struct zbc_query_params *query_params)
3675 u32 index = query_params->index_size;
3678 switch (query_params->type) {
3679 case GK20A_ZBC_TYPE_INVALID:
3680 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
3682 case GK20A_ZBC_TYPE_COLOR:
3683 if (index >= GK20A_ZBC_TABLE_SIZE) {
3684 gk20a_err(dev_from_gk20a(g),
3685 "invalid zbc color table index\n");
3688 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3689 query_params->color_l2[i] =
3690 gr->zbc_col_tbl[index].color_l2[i];
3691 query_params->color_ds[i] =
3692 gr->zbc_col_tbl[index].color_ds[i];
3694 query_params->format = gr->zbc_col_tbl[index].format;
3695 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
3697 case GK20A_ZBC_TYPE_DEPTH:
3698 if (index >= GK20A_ZBC_TABLE_SIZE) {
3699 gk20a_err(dev_from_gk20a(g),
3700 "invalid zbc depth table index\n");
3703 query_params->depth = gr->zbc_dep_tbl[index].depth;
3704 query_params->format = gr->zbc_dep_tbl[index].format;
3705 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
3708 gk20a_err(dev_from_gk20a(g),
3709 "invalid zbc table type\n");
3716 int gr_gk20a_load_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
3720 mutex_init(&gr->zbc_lock);
3721 for (i = 0; i < gr->max_used_color_index; i++) {
3722 struct zbc_color_table *c_tbl = &gr->zbc_col_tbl[i];
3723 struct zbc_entry zbc_val;
3725 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3726 memcpy(zbc_val.color_ds,
3727 c_tbl->color_ds, sizeof(zbc_val.color_ds));
3728 memcpy(zbc_val.color_l2,
3729 c_tbl->color_l2, sizeof(zbc_val.color_l2));
3730 zbc_val.format = c_tbl->format;
3732 ret = gr_gk20a_add_zbc_color(g, gr, &zbc_val, i);
3737 for (i = 0; i < gr->max_used_depth_index; i++) {
3738 struct zbc_depth_table *d_tbl = &gr->zbc_dep_tbl[i];
3739 struct zbc_entry zbc_val;
3741 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3742 zbc_val.depth = d_tbl->depth;
3743 zbc_val.format = d_tbl->format;
3745 ret = gr_gk20a_add_zbc_depth(g, gr, &zbc_val, i);
3752 int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
3754 struct zbc_entry zbc_val;
3757 /* load default color table */
3758 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3760 zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
3761 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3762 zbc_val.color_ds[i] = 0;
3763 zbc_val.color_l2[i] = 0;
3765 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3767 zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
3768 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3769 zbc_val.color_ds[i] = 0xffffffff;
3770 zbc_val.color_l2[i] = 0x3f800000;
3772 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3774 zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3775 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3776 zbc_val.color_ds[i] = 0;
3777 zbc_val.color_l2[i] = 0;
3779 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3781 zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3782 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3783 zbc_val.color_ds[i] = 0x3f800000;
3784 zbc_val.color_l2[i] = 0x3f800000;
3786 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3789 gr->max_default_color_index = 4;
3791 gk20a_err(dev_from_gk20a(g),
3792 "fail to load default zbc color table\n");
3796 /* load default depth table */
3797 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3799 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3801 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3803 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3804 zbc_val.depth = 0x3f800000;
3805 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3808 gr->max_default_depth_index = 2;
3810 gk20a_err(dev_from_gk20a(g),
3811 "fail to load default zbc depth table\n");
3818 int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
3819 struct zbc_entry *zbc_val)
3823 return gr_gk20a_elpg_protected_call(g,
3824 gr_gk20a_add_zbc(g, gr, zbc_val));
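/*
 * The two helpers below program block-level (BLCG) and engine-level (ELCG)
 * clock gating through therm_gate_ctrl: RUN forces the clock on, AUTO lets
 * hardware gate it on idle (ELCG additionally has a STOP mode). The ELCG
 * path also sets an idle filter of roughly 2 * 2^9 = 1024 clocks and clears
 * the FECS and hubmmu idle filters.
 */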
3827 void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
3831 gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3835 gate_ctrl = set_field(gate_ctrl,
3836 therm_gate_ctrl_blk_clk_m(),
3837 therm_gate_ctrl_blk_clk_run_f());
3840 gate_ctrl = set_field(gate_ctrl,
3841 therm_gate_ctrl_blk_clk_m(),
3842 therm_gate_ctrl_blk_clk_auto_f());
3845 gk20a_err(dev_from_gk20a(g),
3846 "invalid blcg mode %d", mode);
3850 gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3853 void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
3855 u32 gate_ctrl, idle_filter;
3857 gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3861 gate_ctrl = set_field(gate_ctrl,
3862 therm_gate_ctrl_eng_clk_m(),
3863 therm_gate_ctrl_eng_clk_run_f());
3864 gate_ctrl = set_field(gate_ctrl,
3865 therm_gate_ctrl_eng_pwr_m(),
3866 /* set elpg to auto to meet hw expectation */
3867 therm_gate_ctrl_eng_pwr_auto_f());
3870 gate_ctrl = set_field(gate_ctrl,
3871 therm_gate_ctrl_eng_clk_m(),
3872 therm_gate_ctrl_eng_clk_stop_f());
3875 gate_ctrl = set_field(gate_ctrl,
3876 therm_gate_ctrl_eng_clk_m(),
3877 therm_gate_ctrl_eng_clk_auto_f());
3880 gk20a_err(dev_from_gk20a(g),
3881 "invalid elcg mode %d", mode);
3884 if (tegra_platform_is_linsim()) {
3885 gate_ctrl = set_field(gate_ctrl,
3886 therm_gate_ctrl_eng_delay_after_m(),
3887 therm_gate_ctrl_eng_delay_after_f(4));
3890 /* 2 * (1 << 9) = 1024 clks */
3891 gate_ctrl = set_field(gate_ctrl,
3892 therm_gate_ctrl_eng_idle_filt_exp_m(),
3893 therm_gate_ctrl_eng_idle_filt_exp_f(9));
3894 gate_ctrl = set_field(gate_ctrl,
3895 therm_gate_ctrl_eng_idle_filt_mant_m(),
3896 therm_gate_ctrl_eng_idle_filt_mant_f(2));
3897 gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3899 /* default fecs_idle_filter to 0 */
3900 idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
3901 idle_filter &= ~therm_fecs_idle_filter_value_m();
3902 gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
3903 /* default hubmmu_idle_filter to 0 */
3904 idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
3905 idle_filter &= ~therm_hubmmu_idle_filter_value_m();
3906 gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
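/*
 * gr_gk20a_zcull_init_hw() below turns the logical map_tiles[] into per-SM
 * zcull bank numbers (zcull_bank_counters[] tracks how many tiles each GPC
 * has received so far), writes them into the four sm_in_gpc_number_map
 * registers, and then programs the per-GPC zcull RAM layout, active bank
 * count and the conservative SM reciprocal.
 */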
3909 static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
3911 u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
3912 u32 *zcull_map_tiles, *zcull_bank_counters;
3916 bool floorsweep = false;
3921 zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
3922 proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3923 if (!zcull_map_tiles) {
3924 gk20a_err(dev_from_gk20a(g),
3925 "failed to allocate zcull temp buffers");
3928 zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
3929 proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3931 if (!zcull_bank_counters) {
3932 gk20a_err(dev_from_gk20a(g),
3933 "failed to allocate zcull temp buffers");
3934 kfree(zcull_map_tiles);
3938 for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
3939 zcull_map_tiles[map_counter] =
3940 zcull_bank_counters[gr->map_tiles[map_counter]];
3941 zcull_bank_counters[gr->map_tiles[map_counter]]++;
3944 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
3945 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
3946 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
3947 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
3948 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
3949 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
3950 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
3951 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
3952 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));
3954 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
3955 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
3956 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
3957 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
3958 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
3959 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
3960 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
3961 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
3962 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));
3964 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
3965 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
3966 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
3967 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
3968 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
3969 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
3970 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
3971 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
3972 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));
3974 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
3975 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
3976 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
3977 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
3978 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
3979 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
3980 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
3981 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
3982 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));
3984 kfree(zcull_map_tiles);
3985 kfree(zcull_bank_counters);
3987 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3988 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
3989 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
3991 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3992 gpc_zcull_count < gpc_tpc_count) {
3993 gk20a_err(dev_from_gk20a(g),
3994 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
3995 gpc_zcull_count, gpc_tpc_count, gpc_index);
3998 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3999 gpc_zcull_count != 0)
4003 /* 1.0f / 1.0f * gr_gpc0_zcull_sm_num_rcp_conservative__max_v() */
4004 rcp_conserv = gr_gpc0_zcull_sm_num_rcp_conservative__max_v();
4006 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4007 offset = gpc_index * proj_gpc_stride_v();
4010 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4011 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4012 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4013 gr->max_zcull_per_gpc_count));
4015 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4016 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4017 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4018 gr->gpc_tpc_count[gpc_index]));
4021 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4022 gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4023 gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4025 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4026 gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4029 gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4030 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4035 static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4037 /* enable tpc exception forwarding */
4038 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
4039 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f());
4041 /* enable gpc exception forwarding */
4042 gk20a_writel(g, gr_gpc0_gpccs_gpc_exception_en_r(),
4043 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f());
4047 void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
4049 /* enable exceptions */
4050 gk20a_writel(g, gr_fe_hww_esr_r(),
4051 gr_fe_hww_esr_en_enable_f() |
4052 gr_fe_hww_esr_reset_active_f());
4053 gk20a_writel(g, gr_memfmt_hww_esr_r(),
4054 gr_memfmt_hww_esr_en_enable_f() |
4055 gr_memfmt_hww_esr_reset_active_f());
4056 gk20a_writel(g, gr_scc_hww_esr_r(),
4057 gr_scc_hww_esr_en_enable_f() |
4058 gr_scc_hww_esr_reset_active_f());
4059 gk20a_writel(g, gr_mme_hww_esr_r(),
4060 gr_mme_hww_esr_en_enable_f() |
4061 gr_mme_hww_esr_reset_active_f());
4062 gk20a_writel(g, gr_pd_hww_esr_r(),
4063 gr_pd_hww_esr_en_enable_f() |
4064 gr_pd_hww_esr_reset_active_f());
4065 gk20a_writel(g, gr_sked_hww_esr_r(), /* enabled by default */
4066 gr_sked_hww_esr_reset_active_f());
4067 gk20a_writel(g, gr_ds_hww_esr_r(),
4068 gr_ds_hww_esr_en_enabled_f() |
4069 gr_ds_hww_esr_reset_task_f());
4070 gk20a_writel(g, gr_ds_hww_report_mask_r(),
4071 gr_ds_hww_report_mask_sph0_err_report_f() |
4072 gr_ds_hww_report_mask_sph1_err_report_f() |
4073 gr_ds_hww_report_mask_sph2_err_report_f() |
4074 gr_ds_hww_report_mask_sph3_err_report_f() |
4075 gr_ds_hww_report_mask_sph4_err_report_f() |
4076 gr_ds_hww_report_mask_sph5_err_report_f() |
4077 gr_ds_hww_report_mask_sph6_err_report_f() |
4078 gr_ds_hww_report_mask_sph7_err_report_f() |
4079 gr_ds_hww_report_mask_sph8_err_report_f() |
4080 gr_ds_hww_report_mask_sph9_err_report_f() |
4081 gr_ds_hww_report_mask_sph10_err_report_f() |
4082 gr_ds_hww_report_mask_sph11_err_report_f() |
4083 gr_ds_hww_report_mask_sph12_err_report_f() |
4084 gr_ds_hww_report_mask_sph13_err_report_f() |
4085 gr_ds_hww_report_mask_sph14_err_report_f() |
4086 gr_ds_hww_report_mask_sph15_err_report_f() |
4087 gr_ds_hww_report_mask_sph16_err_report_f() |
4088 gr_ds_hww_report_mask_sph17_err_report_f() |
4089 gr_ds_hww_report_mask_sph18_err_report_f() |
4090 gr_ds_hww_report_mask_sph19_err_report_f() |
4091 gr_ds_hww_report_mask_sph20_err_report_f() |
4092 gr_ds_hww_report_mask_sph21_err_report_f() |
4093 gr_ds_hww_report_mask_sph22_err_report_f() |
4094 gr_ds_hww_report_mask_sph23_err_report_f());
4097 static void gr_gk20a_set_hww_esr_report_mask(struct gk20a *g)
4099 /* setup sm warp esr report masks */
4100 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4101 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4102 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4103 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4104 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4105 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4106 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4107 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4108 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4109 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4110 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4111 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4112 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4113 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4114 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4115 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4116 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4117 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4118 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4119 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4120 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4122 /* setup sm global esr report mask */
4123 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4124 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4125 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4126 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4127 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4128 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4129 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4130 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4133 static int gk20a_init_gr_setup_hw(struct gk20a *g)
4135 struct gr_gk20a *gr = &g->gr;
4136 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4137 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4139 u32 addr_lo, addr_hi;
4141 unsigned long end_jiffies = jiffies +
4142 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4143 u32 fe_go_idle_timeout_save;
4144 u32 last_method_data = 0;
4149 /* slcg prod values */
4150 g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled);
4151 g->ops.clock_gating.slcg_perf_load_gating_prod(g, g->slcg_enabled);
4153 /* init mmu debug buffer */
4154 addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_wr_mem.iova);
4155 addr_lo = u64_lo32(addr);
4156 addr_hi = u64_hi32(addr);
4157 addr = (addr_lo >> fb_mmu_debug_wr_addr_alignment_v()) |
4158 (addr_hi << (32 - fb_mmu_debug_wr_addr_alignment_v()));
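/* Illustration (not from the original source): the two shifts above pack a
 * byte address into the debug_wr addr field. Assuming the alignment value
 * is 12 (4 KiB), an iova of 0x80001000 gives addr_lo = 0x80001000,
 * addr_hi = 0, and a packed field of 0x80001, i.e. iova >> 12 carried out
 * with the 32-bit halves of the 64-bit address. */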
4160 gk20a_writel(g, fb_mmu_debug_wr_r(),
4161 fb_mmu_debug_wr_aperture_vid_mem_f() |
4162 fb_mmu_debug_wr_vol_false_f() |
4163 fb_mmu_debug_wr_addr_v(addr));
4165 addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_rd_mem.iova);
4166 addr_lo = u64_lo32(addr);
4167 addr_hi = u64_hi32(addr);
4168 addr = (addr_lo >> fb_mmu_debug_rd_addr_alignment_v()) |
4169 (addr_hi << (32 - fb_mmu_debug_rd_addr_alignment_v()));
4171 gk20a_writel(g, fb_mmu_debug_rd_r(),
4172 fb_mmu_debug_rd_aperture_vid_mem_f() |
4173 fb_mmu_debug_rd_vol_false_f() |
4174 fb_mmu_debug_rd_addr_v(addr));
4176 /* load gr floorsweeping registers */
4177 data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4178 data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4179 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4180 gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4182 gr_gk20a_zcull_init_hw(g, gr);
4184 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
4185 g->ops.clock_gating.pg_gr_load_gating_prod(g, true);
4187 if (g->elcg_enabled) {
4188 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
4189 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
4191 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
4192 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
4195 /* Bug 1340570: increase the clock timeout to avoid potential
4196 * operation failure at high gpcclk rate. Default values are 0x400.
4198 gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
4199 gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
4200 gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
4202 /* enable fifo access */
4203 gk20a_writel(g, gr_gpfifo_ctl_r(),
4204 gr_gpfifo_ctl_access_enabled_f() |
4205 gr_gpfifo_ctl_semaphore_access_enabled_f());
4207 /* TBD: reload gr ucode when needed */
4209 /* enable interrupts */
4210 gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4211 gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4213 /* enable fecs error interrupts */
4214 gk20a_writel(g, gr_fecs_host_int_enable_r(),
4215 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4216 gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4217 gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4218 gr_fecs_host_int_enable_watchdog_enable_f());
4220 g->ops.gr.enable_hww_exceptions(g);
4221 g->ops.gr.set_hww_esr_report_mask(g);
4223 /* enable per GPC exceptions */
4224 gk20a_gr_enable_gpc_exceptions(g);
4226 /* TBD: ECC for L1/SM */
4227 /* TBD: enable per BE exceptions */
4229 /* reset and enable all exceptions */
4230 gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4231 gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4232 gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4233 gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4234 gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4235 gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4237 /* ignore status from some units */
4238 data = gk20a_readl(g, gr_status_mask_r());
4239 gk20a_writel(g, gr_status_mask_r(), data & gr->status_disable_mask);
4242 gr_gk20a_load_zbc_table(g, gr);
4244 gr_gk20a_load_zbc_default_table(g, gr);
4246 g->ops.ltc.init_cbc(g, gr);
4249 for (i = 0; i < sw_ctx_load->count; i++)
4250 gk20a_writel(g, sw_ctx_load->l[i].addr,
4251 sw_ctx_load->l[i].value);
4253 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4257 /* save and disable fe_go_idle */
4258 fe_go_idle_timeout_save =
4259 gk20a_readl(g, gr_fe_go_idle_timeout_r());
4260 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4261 (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
4262 gr_fe_go_idle_timeout_count_disabled_f());
4264 /* override a few ctx state registers */
4265 g->ops.gr.commit_global_cb_manager(g, NULL, false);
4266 gr_gk20a_commit_global_timeslice(g, NULL, false);
4268 /* floorsweep anything left */
4269 g->ops.gr.init_fs_state(g);
4271 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4273 goto restore_fe_go_idle;
4276 /* restore fe_go_idle */
4277 gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
4279 if (err || gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT))
4282 /* load method init */
4283 if (sw_method_init->count) {
4284 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4285 sw_method_init->l[0].value);
4286 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4287 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4288 sw_method_init->l[0].addr);
4289 last_method_data = sw_method_init->l[0].value;
4291 for (i = 1; i < sw_method_init->count; i++) {
4292 if (sw_method_init->l[i].value != last_method_data) {
4293 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4294 sw_method_init->l[i].value);
4295 last_method_data = sw_method_init->l[i].value;
4297 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4298 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4299 sw_method_init->l[i].addr);
4302 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4307 gk20a_dbg_fn("done");
4311 static int gk20a_init_gr_prepare(struct gk20a *g)
4313 u32 gpfifo_ctrl, pmc_en;
4316 /* disable fifo access */
4317 pmc_en = gk20a_readl(g, mc_enable_r());
4318 if (pmc_en & mc_enable_pgraph_enabled_f()) {
4319 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4320 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4321 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4324 /* reset gr engine */
4325 gk20a_reset(g, mc_enable_pgraph_enabled_f()
4326 | mc_enable_blg_enabled_f()
4327 | mc_enable_perfmon_enabled_f());
4329 /* enable fifo access */
4330 gk20a_writel(g, gr_gpfifo_ctl_r(),
4331 gr_gpfifo_ctl_access_enabled_f() |
4332 gr_gpfifo_ctl_semaphore_access_enabled_f());
4334 if (!g->gr.ctx_vars.valid) {
4335 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4337 gk20a_err(dev_from_gk20a(g),
4338 "fail to load gr init ctx");
4343 static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
4345 int retries = GR_IDLE_CHECK_MAX / GR_IDLE_CHECK_DEFAULT;
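/* Rough timing budget, as implied by the loop below: each pass polls both
 * falcons and then waits GR_IDLE_CHECK_DEFAULT microseconds, so scrubbing
 * is given roughly GR_IDLE_CHECK_MAX microseconds in total before the
 * timeout is reported (pre-silicon platforms keep polling). */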
4346 bool fecs_scrubbing;
4347 bool gpccs_scrubbing;
4352 fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
4353 (gr_fecs_dmactl_imem_scrubbing_m() |
4354 gr_fecs_dmactl_dmem_scrubbing_m());
4356 gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
4357 (gr_gpccs_dmactl_imem_scrubbing_m() |
4358 gr_gpccs_dmactl_dmem_scrubbing_m());
4360 if (!fecs_scrubbing && !gpccs_scrubbing) {
4361 gk20a_dbg_fn("done");
4365 udelay(GR_IDLE_CHECK_DEFAULT);
4366 } while (--retries || !tegra_platform_is_silicon());
4368 gk20a_err(dev_from_gk20a(g), "Falcon mem scrubbing timeout");
4372 static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4374 struct gr_gk20a *gr = &g->gr;
4375 struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4376 unsigned long end_jiffies = jiffies +
4377 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4382 /* enable interrupts */
4383 gk20a_writel(g, gr_intr_r(), ~0);
4384 gk20a_writel(g, gr_intr_en_r(), ~0);
4386 /* reset ctx switch state */
4387 gr_gk20a_ctx_reset(g, 0);
4390 gk20a_writel(g, gr_scc_init_r(),
4391 gr_scc_init_ram_trigger_f());
4393 /* load non_ctx init */
4394 for (i = 0; i < sw_non_ctx_load->count; i++)
4395 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4396 sw_non_ctx_load->l[i].value);
4398 err = gr_gk20a_wait_mem_scrubbing(g);
4402 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4406 err = gr_gk20a_load_ctxsw_ucode(g, gr);
4410 /* this appears to query sw state, but fecs actually initializes the
4411 ramchain etc., so this is hw init */
4412 err = gr_gk20a_init_ctx_state(g, gr);
4418 gk20a_err(dev_from_gk20a(g), "fail");
4420 gk20a_dbg_fn("done");
4426 * XXX Merge this list with the debugger/profiler
4427 * session regops whitelists?
4429 static u32 wl_addr_gk20a[] = {
4430 /* this list must be sorted (low to high) */
4431 0x404468, /* gr_pri_mme_max_instructions */
4432 0x408944, /* gr_pri_bes_crop_hww_esr */
4433 0x418800, /* gr_pri_gpcs_setup_debug */
4434 0x419a04, /* gr_pri_gpcs_tpcs_tex_lod_dbg */
4435 0x419a08, /* gr_pri_gpcs_tpcs_tex_samp_dbg */
4436 0x419e10, /* gr_pri_gpcs_tpcs_sm_dbgr_control0 */
4437 0x419f78, /* gr_pri_gpcs_tpcs_sm_disp_ctrl */
4440 static int gr_gk20a_init_access_map(struct gk20a *g)
4442 struct gr_gk20a *gr = &g->gr;
4446 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4449 data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].pages,
4450 PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].size) >>
4451 PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
4453 gk20a_err(dev_from_gk20a(g),
4454 "failed to map priv access map memory");
4459 memset(data, 0x0, PAGE_SIZE * nr_pages);
4461 for (w = 0; w < ARRAY_SIZE(wl_addr_gk20a); w++) {
4462 u32 map_bit, map_byte, map_shift;
4463 map_bit = wl_addr_gk20a[w] >> 2;
4464 map_byte = map_bit >> 3;
4465 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4466 gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d",
4467 wl_addr_gk20a[w], map_byte, map_shift);
4468 ((u8 *)data)[map_byte] |= 1 << map_shift;
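/* Worked example (illustrative only): for whitelisted address 0x404468,
 * map_bit = 0x404468 >> 2 = 0x10111a, map_byte = 0x20223 and map_shift = 2,
 * so bit 2 of byte 0x20223 in the PRIV_ACCESS_MAP buffer gets set. */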
4477 static int gk20a_init_gr_setup_sw(struct gk20a *g)
4479 struct gr_gk20a *gr = &g->gr;
4485 gk20a_dbg_fn("skip init");
4491 err = gr_gk20a_init_gr_config(g, gr);
4495 err = gr_gk20a_init_mmu_sw(g, gr);
4499 err = gr_gk20a_init_map_tiles(g, gr);
4503 if (tegra_cpu_is_asim())
4504 gr->max_comptag_mem = 1; /* MBs worth of comptag coverage */
4506 gk20a_dbg_info("total ram pages : %lu", totalram_pages);
4507 gr->max_comptag_mem = totalram_pages
4508 >> (10 - (PAGE_SHIFT - 10));
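/* Note (illustrative): the shift is equivalent to >> (20 - PAGE_SHIFT),
 * i.e. total RAM expressed in MB. With 4 KiB pages and 2 GiB of RAM
 * (524288 pages) this yields 2048, so comptags can cover all of it. */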
4510 err = g->ops.ltc.init_comptags(g, gr);
4514 err = gr_gk20a_init_zcull(g, gr);
4518 err = gr_gk20a_alloc_global_ctx_buffers(g);
4522 err = gr_gk20a_init_access_map(g);
4526 mutex_init(&gr->ctx_mutex);
4527 spin_lock_init(&gr->ch_tlb_lock);
4529 gr->remove_support = gk20a_remove_gr_support;
4530 gr->sw_ready = true;
4532 gk20a_dbg_fn("done");
4536 gk20a_err(dev_from_gk20a(g), "fail");
4537 gk20a_remove_gr_support(gr);
4541 int gk20a_init_gr_support(struct gk20a *g)
4547 err = gk20a_init_gr_prepare(g);
4551 /* this is required before gr_gk20a_init_ctx_state */
4552 mutex_init(&g->gr.fecs_mutex);
4554 err = gk20a_init_gr_reset_enable_hw(g);
4558 err = gk20a_init_gr_setup_sw(g);
4562 err = gk20a_init_gr_setup_hw(g);
4566 /* GR is initialized, signal possible waiters */
4567 g->gr.initialized = true;
4568 wake_up(&g->gr.init_wq);
4573 /* Wait until GR is initialized */
4574 void gk20a_gr_wait_initialized(struct gk20a *g)
4576 wait_event(g->gr.init_wq, g->gr.initialized);
4579 #define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
4580 #define NVA297_SET_CIRCULAR_BUFFER_SIZE 0x1280
4581 #define NVA297_SET_SHADER_EXCEPTIONS 0x1528
4582 #define NVA0C0_SET_SHADER_EXCEPTIONS 0x1528
4584 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4586 struct gr_isr_data {
4597 void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
4601 if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
4603 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
4605 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
4607 /* setup sm warp esr report masks */
4608 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4609 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4610 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4611 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4612 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4613 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4614 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4615 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4616 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4617 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4618 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4619 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4620 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4621 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4622 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4623 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4624 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4625 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4626 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4627 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4628 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4630 /* setup sm global esr report mask */
4631 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4632 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4633 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4634 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4635 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4636 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4637 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4638 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4642 static void gk20a_gr_set_circular_buffer_size(struct gk20a *g, u32 data)
4644 struct gr_gk20a *gr = &g->gr;
4645 u32 gpc_index, ppc_index, stride, val, offset;
4646 u32 cb_size = data * 4;
4650 if (cb_size > gr->attrib_cb_size)
4651 cb_size = gr->attrib_cb_size;
4653 gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4654 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4655 ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
4656 gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
4658 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4659 stride = proj_gpc_stride_v() * gpc_index;
4661 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4664 val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
4666 proj_ppc_in_gpc_stride_v() * ppc_index);
4668 offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
4670 val = set_field(val,
4671 gr_gpc0_ppc0_cbm_cfg_size_m(),
4672 gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
4673 gr->pes_tpc_count[ppc_index][gpc_index]));
4674 val = set_field(val,
4675 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4678 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4680 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4682 val = set_field(val,
4683 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4686 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4688 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4693 static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g, u32 data)
4695 struct gr_gk20a *gr = &g->gr;
4696 u32 gpc_index, ppc_index, stride, val;
4697 u32 pd_ab_max_output;
4698 u32 alpha_cb_size = data * 4;
4701 /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF)
4704 if (alpha_cb_size > gr->alpha_cb_size)
4705 alpha_cb_size = gr->alpha_cb_size;
4707 gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4708 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4709 ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
4710 gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
4712 pd_ab_max_output = alpha_cb_size *
4713 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
4714 gr_pd_ab_dist_cfg1_max_output_granularity_v();
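/* The multiply/divide above is a unit conversion: the alpha CB size, held
 * in CBM size-granularity units, is rescaled into the PD max_output
 * granularity before being programmed below. */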
4716 gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
4717 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output));
4719 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4720 stride = proj_gpc_stride_v() * gpc_index;
4722 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4725 val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4727 proj_ppc_in_gpc_stride_v() * ppc_index);
4729 val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
4730 gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
4731 gr->pes_tpc_count[ppc_index][gpc_index]));
4733 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4735 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4740 int gk20a_gr_reset(struct gk20a *g)
4745 err = gk20a_init_gr_prepare(g);
4749 err = gk20a_init_gr_reset_enable_hw(g);
4753 err = gk20a_init_gr_setup_hw(g);
4758 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
4760 gk20a_err(dev_from_gk20a(g),
4761 "fail to query fecs pg buffer size");
4765 err = gr_gk20a_fecs_set_reglist_bind_inst(g,
4766 g->mm.pmu.inst_block.cpu_pa);
4768 gk20a_err(dev_from_gk20a(g),
4769 "fail to bind pmu inst to gr");
4773 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, g->pmu.pg_buf.pmu_va);
4775 gk20a_err(dev_from_gk20a(g),
4776 "fail to set pg buffer pmu va");
4783 static int gr_gk20a_handle_sw_method(struct gk20a *g, u32 addr,
4784 u32 class_num, u32 offset, u32 data)
4788 if (class_num == KEPLER_COMPUTE_A) {
4789 switch (offset << 2) {
4790 case NVA0C0_SET_SHADER_EXCEPTIONS:
4791 gk20a_gr_set_shader_exceptions(g, data);
4798 if (class_num == KEPLER_C) {
4799 switch (offset << 2) {
4800 case NVA297_SET_SHADER_EXCEPTIONS:
4801 gk20a_gr_set_shader_exceptions(g, data);
4803 case NVA297_SET_CIRCULAR_BUFFER_SIZE:
4804 g->ops.gr.set_circular_buffer_size(g, data);
4806 case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
4807 g->ops.gr.set_alpha_circular_buffer_size(g, data);
4819 static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
4820 struct gr_isr_data *isr_data)
4822 struct fifo_gk20a *f = &g->fifo;
4823 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4825 gk20a_set_error_notifier(ch,
4826 NVHOST_CHANNEL_GR_SEMAPHORE_TIMEOUT);
4827 gk20a_err(dev_from_gk20a(g),
4828 "gr semaphore timeout\n");
4832 static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
4833 struct gr_isr_data *isr_data)
4835 struct fifo_gk20a *f = &g->fifo;
4836 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4838 gk20a_set_error_notifier(ch,
4839 NVHOST_CHANNEL_GR_ILLEGAL_NOTIFY);
4840 /* This is an unrecoverable error, reset is needed */
4841 gk20a_err(dev_from_gk20a(g),
4842 "gr semaphore timeout\n");
4846 static int gk20a_gr_handle_illegal_method(struct gk20a *g,
4847 struct gr_isr_data *isr_data)
4849 int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
4850 isr_data->class_num, isr_data->offset,
4853 gk20a_err(dev_from_gk20a(g), "invalid method class 0x%08x"
4854 ", offset 0x%08x address 0x%08x\n",
4855 isr_data->class_num, isr_data->offset, isr_data->addr);
4860 static int gk20a_gr_handle_illegal_class(struct gk20a *g,
4861 struct gr_isr_data *isr_data)
4863 struct fifo_gk20a *f = &g->fifo;
4864 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4866 gk20a_set_error_notifier(ch,
4867 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4868 gk20a_err(dev_from_gk20a(g),
4869 "invalid class 0x%08x, offset 0x%08x",
4870 isr_data->class_num, isr_data->offset);
4874 static int gk20a_gr_handle_fecs_error(struct gk20a *g,
4875 struct gr_isr_data *isr_data)
4877 struct fifo_gk20a *f = &g->fifo;
4878 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4879 u32 gr_fecs_intr = gk20a_readl(g, gr_fecs_intr_r());
4882 gk20a_err(dev_from_gk20a(g),
4883 "unhandled fecs error interrupt 0x%08x for channel %u",
4884 gr_fecs_intr, ch->hw_chid);
4886 gk20a_writel(g, gr_fecs_intr_r(), gr_fecs_intr);
4890 static int gk20a_gr_handle_class_error(struct gk20a *g,
4891 struct gr_isr_data *isr_data)
4893 struct fifo_gk20a *f = &g->fifo;
4894 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4895 u32 gr_class_error =
4896 gr_class_error_code_v(gk20a_readl(g, gr_class_error_r()));
4899 gk20a_set_error_notifier(ch,
4900 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4901 gk20a_err(dev_from_gk20a(g),
4902 "class error 0x%08x, offset 0x%08x, unhandled intr 0x%08x for channel %u\n",
4903 isr_data->class_num, isr_data->offset,
4904 gr_class_error, ch->hw_chid);
4908 static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
4909 struct gr_isr_data *isr_data)
4911 struct fifo_gk20a *f = &g->fifo;
4912 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4914 wake_up(&ch->semaphore_wq);
4919 #if defined(CONFIG_GK20A_CYCLE_STATS)
4920 static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
4923 /* support only 24-bit 4-byte aligned offsets */
4924 bool valid = !(offset & 0xFF000003);
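/* Example (illustrative only): 0x00404468 passes this test (fits in 24
 * bits and is 4-byte aligned), while 0x00404469 (misaligned) or
 * 0x01404468 (needs more than 24 bits) would be rejected. */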
4925 /* whitelist check */
4927 is_bar0_global_offset_whitelisted_gk20a(offset);
4928 /* resource size check in case there was a problem
4929 * with allocating the assumed size of bar0 */
4931 offset < resource_size(g->reg_mem);
4936 static int gk20a_gr_handle_notify_pending(struct gk20a *g,
4937 struct gr_isr_data *isr_data)
4939 struct fifo_gk20a *f = &g->fifo;
4940 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4942 #if defined(CONFIG_GK20A_CYCLE_STATS)
4943 void *virtual_address;
4948 struct share_buffer_head *sh_hdr;
4952 struct gk20a_cyclestate_buffer_elem *op_elem;
4953 /* GL will never use payload 0 for cycle state */
4954 if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
4957 mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
4959 virtual_address = ch->cyclestate.cyclestate_buffer;
4960 buffer_size = ch->cyclestate.cyclestate_buffer_size;
4961 offset = isr_data->data_lo;
4964 if (offset >= buffer_size) {
4969 sh_hdr = (struct share_buffer_head *)
4970 ((char *)virtual_address + offset);
4972 if (sh_hdr->size < sizeof(struct share_buffer_head)) {
4976 new_offset = offset + sh_hdr->size;
4978 switch (sh_hdr->operation) {
4988 (struct gk20a_cyclestate_buffer_elem *)
4990 valid = is_valid_cyclestats_bar0_offset_gk20a(g,
4991 op_elem->offset_bar0);
4993 gk20a_err(dev_from_gk20a(g),
4994 "invalid cycletstats op offset: 0x%x\n",
4995 op_elem->offset_bar0);
4997 sh_hdr->failed = exit = true;
5004 (op_elem->last_bit + 1))
5006 op_elem->first_bit)-1);
5010 op_elem->offset_bar0);
5012 switch (sh_hdr->operation) {
5015 (raw_reg & mask_orig)
5016 >> op_elem->first_bit;
5021 if ((unsigned int)mask_orig !=
5024 (raw_reg & ~mask_orig);
5027 v |= ((op_elem->data
5028 << op_elem->first_bit)
5032 op_elem->offset_bar0,
5043 /* no operation content case */
5047 sh_hdr->completed = true;
5048 offset = new_offset;
5050 mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
5053 wake_up(&ch->notifier_wq);
5057 /* Used by sw interrupt thread to translate current ctx to chid.
5058 * For performance, we don't want to go through 128 channels every time.
5059 * curr_ctx should be the value read from gr_fecs_current_ctx_r().
5060 * A small tlb is used here to cache translation */
5061 static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx)
5063 struct fifo_gk20a *f = &g->fifo;
5064 struct gr_gk20a *gr = &g->gr;
5068 /* when contexts are unloaded from GR, the valid bit is reset
5069 * but the instance pointer information remains intact. So the
5070 * valid bit must be checked to be absolutely certain that a
5071 * valid context is currently resident. */
5072 if (!gr_fecs_current_ctx_valid_v(curr_ctx))
5075 spin_lock(&gr->ch_tlb_lock);
5077 /* check cache first */
5078 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5079 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5080 chid = gr->chid_tlb[i].hw_chid;
5086 for (chid = 0; chid < f->num_channels; chid++)
5087 if (f->channel[chid].in_use) {
5088 if ((u32)(f->channel[chid].inst_block.cpu_pa >>
5089 ram_in_base_shift_v()) ==
5090 gr_fecs_current_ctx_ptr_v(curr_ctx))
5094 if (chid >= f->num_channels) {
5099 /* add to free tlb entry */
5100 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5101 if (gr->chid_tlb[i].curr_ctx == 0) {
5102 gr->chid_tlb[i].curr_ctx = curr_ctx;
5103 gr->chid_tlb[i].hw_chid = chid;
5108 /* no free entry, flush one */
5109 gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5110 gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
5112 gr->channel_tlb_flush_index =
5113 (gr->channel_tlb_flush_index + 1) &
5114 (GR_CHANNEL_MAP_TLB_SIZE - 1);
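/* Round-robin eviction: masking with (size - 1) implements the wrap-around,
 * which relies on GR_CHANNEL_MAP_TLB_SIZE being a power of two. */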
5117 spin_unlock(&gr->ch_tlb_lock);
5121 static int gk20a_gr_lock_down_sm(struct gk20a *g, u32 global_esr_mask)
5123 unsigned long end_jiffies = jiffies +
5124 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5125 u32 delay = GR_IDLE_CHECK_DEFAULT;
5126 bool mmu_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled(g);
5129 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locking down SM");
5131 /* assert stop trigger */
5132 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5133 dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5134 gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5136 /* wait for the sm to lock down */
5138 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5139 u32 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5140 u32 dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r());
5142 (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
5143 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
5144 bool error_pending =
5145 (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
5146 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
5147 ((global_esr & ~global_esr_mask) != 0);
5149 if (locked_down || !error_pending) {
5150 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locked down SM");
5152 /* de-assert stop trigger */
5153 dbgr_control0 &= ~gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5154 gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5159 /* if an mmu fault is pending and mmu debug mode is not
5160 * enabled, the sm will never lock down. */
5161 if (!mmu_debug_mode_enabled && gk20a_fifo_mmu_fault_pending(g)) {
5162 gk20a_err(dev_from_gk20a(g), "mmu fault pending, sm will"
5163 " never lock down!");
5167 usleep_range(delay, delay * 2);
5168 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
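/* Exponential back-off: the poll interval doubles on every iteration and
 * is capped at GR_IDLE_CHECK_MAX microseconds. */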
5170 } while (time_before(jiffies, end_jiffies)
5171 || !tegra_platform_is_silicon());
5173 gk20a_err(dev_from_gk20a(g), "timed out while trying to lock down SM");
5178 bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5180 u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5182 /* check if an sm debugger is attached */
5183 if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5184 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
5190 static void gk20a_gr_clear_sm_hww(struct gk20a *g, u32 global_esr)
5192 gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r(), global_esr);
5194 /* clear the warp hww */
5195 gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r(),
5196 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
5199 static struct channel_gk20a *
5200 channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
5202 return g->fifo.channel+hw_chid;
5205 static int gk20a_gr_handle_sm_exception(struct gk20a *g,
5206 struct gr_isr_data *isr_data)
5209 bool do_warp_sync = false;
5210 /* these three interrupts don't require locking down the SM. They can
5211 * be handled by usermode clients as they aren't fatal. Additionally,
5212 * usermode clients may wish to allow some warps to execute while others
5213 * are at breakpoints, as opposed to fatal errors where all warps should
5215 u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
5216 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
5217 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
5218 u32 global_esr, warp_esr;
5219 bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
5220 struct channel_gk20a *fault_ch;
5222 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
5224 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5225 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5227 /* if an sm debugger is attached, disable forwarding of tpc exceptions.
5228 * the debugger will reenable exceptions after servicing them. */
5229 if (sm_debugger_attached) {
5230 u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
5231 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5232 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), tpc_exception_en);
5233 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM debugger attached");
5236 /* if a debugger is present and an error has occurred, do a warp sync */
5237 if (sm_debugger_attached && ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5238 gk20a_dbg(gpu_dbg_intr, "warp sync needed");
5239 do_warp_sync = true;
5243 ret = gk20a_gr_lock_down_sm(g, global_mask);
5245 gk20a_err(dev_from_gk20a(g), "sm did not lock down!\n");
5250 /* finally, signal any client waiting on an event */
5251 fault_ch = channel_from_hw_chid(g, isr_data->chid);
5253 gk20a_dbg_gpu_post_events(fault_ch);
5258 static int gk20a_gr_handle_tpc_exception(struct gk20a *g,
5259 struct gr_isr_data *isr_data)
5262 u32 tpc_exception = gk20a_readl(g, gr_gpcs_tpcs_tpccs_tpc_exception_r());
5264 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5266 /* check if an sm exception is pending */
5267 if (gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(tpc_exception) ==
5268 gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v()) {
5269 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM exception pending");
5270 ret = gk20a_gr_handle_sm_exception(g, isr_data);
5276 static int gk20a_gr_handle_gpc_exception(struct gk20a *g,
5277 struct gr_isr_data *isr_data)
5280 u32 gpc_exception = gk20a_readl(g, gr_gpcs_gpccs_gpc_exception_r());
5282 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5284 /* check if tpc 0 has an exception */
5285 if (gr_gpcs_gpccs_gpc_exception_tpc_v(gpc_exception) ==
5286 gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v()) {
5287 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "TPC exception pending");
5288 ret = gk20a_gr_handle_tpc_exception(g, isr_data);
5294 int gk20a_gr_isr(struct gk20a *g)
5296 struct gr_isr_data isr_data;
5300 u32 gr_intr = gk20a_readl(g, gr_intr_r());
5303 gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr);
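/* GR fifo and semaphore access are turned off while the interrupt is
 * serviced; they are restored near the end of this handler. */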
5308 grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5309 grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5310 grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5312 gk20a_writel(g, gr_gpfifo_ctl_r(),
5313 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
5314 gr_gpfifo_ctl_semaphore_access_f(0));
5316 isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
5317 isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
5318 isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
5319 isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
5320 isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
5321 isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
5322 obj_table = gk20a_readl(g,
5323 gr_fe_object_table_r(isr_data.sub_chan));
5324 isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
5327 gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx);
5328 if (isr_data.chid == -1) {
5329 gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
5334 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5335 "channel %d: addr 0x%08x, "
5336 "data 0x%08x 0x%08x,"
5337 "ctx 0x%08x, offset 0x%08x, "
5338 "subchannel 0x%08x, class 0x%08x",
5339 isr_data.chid, isr_data.addr,
5340 isr_data.data_hi, isr_data.data_lo,
5341 isr_data.curr_ctx, isr_data.offset,
5342 isr_data.sub_chan, isr_data.class_num);
5344 if (gr_intr & gr_intr_notify_pending_f()) {
5345 gk20a_gr_handle_notify_pending(g, &isr_data);
5346 gk20a_writel(g, gr_intr_r(),
5347 gr_intr_notify_reset_f());
5348 gr_intr &= ~gr_intr_notify_pending_f();
5351 if (gr_intr & gr_intr_semaphore_pending_f()) {
5352 gk20a_gr_handle_semaphore_pending(g, &isr_data);
5353 gk20a_writel(g, gr_intr_r(),
5354 gr_intr_semaphore_reset_f());
5355 gr_intr &= ~gr_intr_semaphore_pending_f();
5358 if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
5359 need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
5361 gk20a_writel(g, gr_intr_r(),
5362 gr_intr_semaphore_reset_f());
5363 gr_intr &= ~gr_intr_semaphore_pending_f();
5366 if (gr_intr & gr_intr_illegal_notify_pending_f()) {
5367 need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
5369 gk20a_writel(g, gr_intr_r(),
5370 gr_intr_illegal_notify_reset_f());
5371 gr_intr &= ~gr_intr_illegal_notify_pending_f();
5374 if (gr_intr & gr_intr_illegal_method_pending_f()) {
5375 need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
5376 gk20a_writel(g, gr_intr_r(),
5377 gr_intr_illegal_method_reset_f());
5378 gr_intr &= ~gr_intr_illegal_method_pending_f();
5381 if (gr_intr & gr_intr_illegal_class_pending_f()) {
5382 need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
5383 gk20a_writel(g, gr_intr_r(),
5384 gr_intr_illegal_class_reset_f());
5385 gr_intr &= ~gr_intr_illegal_class_pending_f();
5388 if (gr_intr & gr_intr_fecs_error_pending_f()) {
5389 need_reset |= gk20a_gr_handle_fecs_error(g, &isr_data);
5390 gk20a_writel(g, gr_intr_r(),
5391 gr_intr_fecs_error_reset_f());
5392 gr_intr &= ~gr_intr_fecs_error_pending_f();
5395 if (gr_intr & gr_intr_class_error_pending_f()) {
5396 need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
5397 gk20a_writel(g, gr_intr_r(),
5398 gr_intr_class_error_reset_f());
5399 gr_intr &= ~gr_intr_class_error_pending_f();
5402 /* this one happens if someone tries to hit a non-whitelisted
5403 * register using set_falcon[4] */
5404 if (gr_intr & gr_intr_firmware_method_pending_f()) {
5406 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
5407 gk20a_writel(g, gr_intr_r(),
5408 gr_intr_firmware_method_reset_f());
5409 gr_intr &= ~gr_intr_firmware_method_pending_f();
5412 if (gr_intr & gr_intr_exception_pending_f()) {
5413 u32 exception = gk20a_readl(g, gr_exception_r());
5414 struct fifo_gk20a *f = &g->fifo;
5415 struct channel_gk20a *ch = &f->channel[isr_data.chid];
5417 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
5419 if (exception & gr_exception_fe_m()) {
5420 u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5421 gk20a_dbg(gpu_dbg_intr, "fe warning %08x\n", fe);
5422 gk20a_writel(g, gr_fe_hww_esr_r(), fe);
5423 need_reset |= -EFAULT;
5426 /* check if a gpc exception has occurred */
5427 if (exception & gr_exception_gpc_m() && need_reset == 0) {
5428 u32 exception1 = gk20a_readl(g, gr_exception1_r());
5429 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5431 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC exception pending");
5433 /* if no sm debugger is present, clean up the channel */
5434 if (!gk20a_gr_sm_debugger_attached(g)) {
5435 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5436 "SM debugger not attached, clearing interrupt");
5437 need_reset |= -EFAULT;
5439 /* check if gpc 0 has an exception */
5440 if (exception1 & gr_exception1_gpc_0_pending_f())
5441 need_reset |= gk20a_gr_handle_gpc_exception(g, &isr_data);
5442 /* clear the hwws, also causes tpc and gpc
5443 * exceptions to be cleared */
5444 gk20a_gr_clear_sm_hww(g, global_esr);
5448 gk20a_set_error_notifier(ch,
5449 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
5452 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
5453 gr_intr &= ~gr_intr_exception_pending_f();
5457 gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true);
5460 gk20a_writel(g, gr_gpfifo_ctl_r(),
5461 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
5462 gr_gpfifo_ctl_semaphore_access_f(1));
5465 gk20a_err(dev_from_gk20a(g),
5466 "unhandled gr interrupt 0x%08x", gr_intr);
5471 int gk20a_gr_nonstall_isr(struct gk20a *g)
5473 u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
5476 gk20a_dbg(gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
5478 if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
5479 gk20a_channel_semaphore_wakeup(g);
5480 clear_intr |= gr_intr_nonstall_trap_pending_f();
5483 gk20a_writel(g, gr_intr_nonstall_r(), clear_intr);
5488 int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
5490 BUG_ON(size == NULL);
5491 return gr_gk20a_submit_fecs_method_op(g,
5492 (struct fecs_method_op_gk20a) {
5497 .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
5498 .mailbox.ret = size,
5499 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
5501 .cond.fail = GR_IS_UCODE_OP_SKIP,
5502 .mailbox.fail = 0});
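/* Note (inferred from the fields below, not a statement from the original
 * author): the bind-instance method passes the PMU instance block to FECS
 * using the gr_fecs_current_ctx encoding, i.e. the physical address in
 * 4 KiB units (addr >> 12) plus the valid bit and vid_mem target. */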
5505 int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
5507 return gr_gk20a_submit_fecs_method_op(g,
5508 (struct fecs_method_op_gk20a){
5510 .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
5511 gr_fecs_current_ctx_valid_f(1) |
5512 gr_fecs_current_ctx_target_vid_mem_f()),
5515 .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
5516 .mailbox.ret = NULL,
5517 .cond.ok = GR_IS_UCODE_OP_EQUAL,
5519 .cond.fail = GR_IS_UCODE_OP_SKIP,
5520 .mailbox.fail = 0});
5523 int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va)
5525 return gr_gk20a_submit_fecs_method_op(g,
5526 (struct fecs_method_op_gk20a) {
5528 .mailbox.data = u64_lo32(pmu_va >> 8),
5531 .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
5532 .mailbox.ret = NULL,
5533 .cond.ok = GR_IS_UCODE_OP_EQUAL,
5535 .cond.fail = GR_IS_UCODE_OP_SKIP,
5536 .mailbox.fail = 0});
5539 int gk20a_gr_suspend(struct gk20a *g)
5541 unsigned long end_jiffies = jiffies +
5542 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5547 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
5551 gk20a_writel(g, gr_gpfifo_ctl_r(),
5552 gr_gpfifo_ctl_access_disabled_f());
5554 /* disable gr intr */
5555 gk20a_writel(g, gr_intr_r(), 0);
5556 gk20a_writel(g, gr_intr_en_r(), 0);
5558 /* disable all exceptions */
5559 gk20a_writel(g, gr_exception_r(), 0);
5560 gk20a_writel(g, gr_exception_en_r(), 0);
5561 gk20a_writel(g, gr_exception1_r(), 0);
5562 gk20a_writel(g, gr_exception1_en_r(), 0);
5563 gk20a_writel(g, gr_exception2_r(), 0);
5564 gk20a_writel(g, gr_exception2_en_r(), 0);
5566 gk20a_gr_flush_channel_tlb(&g->gr);
5568 g->gr.initialized = false;
5570 gk20a_dbg_fn("done");
5574 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5576 bool is_quad, u32 quad,
5577 u32 *context_buffer,
5578 u32 context_buffer_size,
5581 /* This function will decode a priv address and return the partition type and numbers. */
5582 int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
5583 int *addr_type, /* enum ctxsw_addr_type */
5584 u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
5585 u32 *broadcast_flags)
5589 u32 ppc_broadcast_addr;
5591 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5593 /* setup defaults */
5595 ppc_broadcast_addr = 0;
5596 *addr_type = CTXSW_ADDR_TYPE_SYS;
5597 *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
5603 if (pri_is_gpc_addr(addr)) {
5604 *addr_type = CTXSW_ADDR_TYPE_GPC;
5605 gpc_addr = pri_gpccs_addr_mask(addr);
5606 if (pri_is_gpc_addr_shared(addr)) {
5607 *addr_type = CTXSW_ADDR_TYPE_GPC;
5608 *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
5610 *gpc_num = pri_get_gpc_num(addr);
5612 if (pri_is_tpc_addr(gpc_addr)) {
5613 *addr_type = CTXSW_ADDR_TYPE_TPC;
5614 if (pri_is_tpc_addr_shared(gpc_addr)) {
5615 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
5618 *tpc_num = pri_get_tpc_num(gpc_addr);
5621 } else if (pri_is_be_addr(addr)) {
5622 *addr_type = CTXSW_ADDR_TYPE_BE;
5623 if (pri_is_be_addr_shared(addr)) {
5624 *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
5627 *be_num = pri_get_be_num(addr);
5630 *addr_type = CTXSW_ADDR_TYPE_SYS;
5639 static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
5641 u32 *priv_addr_table, u32 *t)
5645 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5647 for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
5648 priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
5655 * The context buffer is indexed using BE broadcast addresses and GPC/TPC
5656 * unicast addresses. This function will convert a BE unicast address to a BE
5657 * broadcast address and split a GPC/TPC broadcast address into a table of
5658 * GPC/TPC addresses. The addresses generated by this function can be
5659 * successfully processed by gr_gk20a_find_priv_offset_in_buffer
5661 static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
5663 u32 *priv_addr_table,
5666 int addr_type; /*enum ctxsw_addr_type */
5667 u32 gpc_num, tpc_num, ppc_num, be_num;
5668 u32 broadcast_flags;
5675 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5677 err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5678 &gpc_num, &tpc_num, &ppc_num, &be_num,
5680 gk20a_dbg(gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
5684 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5685 (addr_type == CTXSW_ADDR_TYPE_BE)) {
5686 /* The BE broadcast registers are included in the compressed PRI
5687 * table. Convert a BE unicast address to a broadcast address
5688 * so that we can look up the offset. */
5689 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
5690 !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
5691 priv_addr_table[t++] = pri_be_shared_addr(addr);
5693 priv_addr_table[t++] = addr;
5699 /* The GPC/TPC unicast registers are included in the compressed PRI
5700 * tables. Convert a GPC/TPC broadcast address to unicast addresses so
5701 * that we can look up the offsets. */
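/* Illustration (not from the original source): with both the GPC and TPC
 * broadcast flags set on a one-GPC, two-TPC configuration, the loops below
 * emit two unicast entries, one for (gpc0, tpc0) and one for (gpc0, tpc1). */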
5702 if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
5703 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
5705 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5707 tpc_num < g->gr.gpc_tpc_count[gpc_num];
5709 priv_addr_table[t++] =
5710 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5713 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
5714 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5715 priv_addr_table, &t);
5719 priv_addr_table[t++] =
5720 pri_gpc_addr(pri_gpccs_addr_mask(addr),
5724 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5726 tpc_num < g->gr.gpc_tpc_count[gpc_num];
5728 priv_addr_table[t++] =
5729 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5731 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
5732 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5733 priv_addr_table, &t);
5735 priv_addr_table[t++] = addr;
5742 int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
5745 u32 *offsets, u32 *offset_addrs,
5747 bool is_quad, u32 quad)
5750 u32 priv_offset = 0;
5751 u32 *priv_registers;
5752 u32 num_registers = 0;
5754 u32 potential_offsets = proj_scal_litter_num_gpcs_v() *
5755 proj_scal_litter_num_tpc_per_gpc_v();
5757 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5759 /* implementation is crossed-up if either of these happen */
5760 if (max_offsets > potential_offsets)
5763 if (!g->gr.ctx_vars.golden_image_initialized)
5766 priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL);
5767 if (IS_ERR_OR_NULL(priv_registers)) {
5768 gk20a_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets);
5769 err = PTR_ERR(priv_registers);
5772 memset(offsets, 0, sizeof(u32) * max_offsets);
5773 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
5776 gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], &num_registers);
5778 if ((max_offsets > 1) && (num_registers > max_offsets)) {
5783 if ((max_offsets == 1) && (num_registers > 1))
5786 if (!g->gr.ctx_vars.local_golden_image) {
5787 gk20a_dbg_fn("no context switch header info to work with");
5792 for (i = 0; i < num_registers; i++) {
5793 err = gr_gk20a_find_priv_offset_in_buffer(g,
5796 g->gr.ctx_vars.local_golden_image,
5797 g->gr.ctx_vars.golden_image_size,
5800 gk20a_dbg_fn("Could not determine priv_offset for addr:0x%x",
5801 addr); /*, grPriRegStr(addr)));*/
5805 offsets[i] = priv_offset;
5806 offset_addrs[i] = priv_registers[i];
5809 *num_offsets = num_registers;
5813 if (!IS_ERR_OR_NULL(priv_registers))
5814 kfree(priv_registers);
5819 /* Setup some register tables. This looks hacky; our
5820 * register/offset functions are just that, functions.
5821 * So they can't be used as initializers... TBD: fix to
5822 * generate consts at least on an as-needed basis.
5824 static const u32 _num_ovr_perf_regs = 17;
5825 static u32 _ovr_perf_regs[17] = { 0, };
5826 /* Following are the blocks of registers that the ucode
5827 stores in the extended region. */
5828 /* == ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */
5829 static const u32 _num_sm_dsm_perf_regs = 5;
5830 /* == ctxsw_extended_sm_dsm_perf_counter_control_register_stride_v() ?*/
5831 static const u32 _num_sm_dsm_perf_ctrl_regs = 4;
5832 static u32 _sm_dsm_perf_regs[5];
5833 static u32 _sm_dsm_perf_ctrl_regs[4];
5835 static void init_sm_dsm_reg_info(void)
5837 if (_ovr_perf_regs[0] != 0)
5840 _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
5841 _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
5842 _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
5843 _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
5844 _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
5845 _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
5846 _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
5847 _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
5848 _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
5849 _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
5850 _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
5851 _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
5852 _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
5853 _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
5854 _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
5855 _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
5856 _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
5859 _sm_dsm_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r();
5860 _sm_dsm_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r();
5861 _sm_dsm_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r();
5862 _sm_dsm_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r();
5863 _sm_dsm_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r();
5865 _sm_dsm_perf_ctrl_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r();
5866 _sm_dsm_perf_ctrl_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r();
5867 _sm_dsm_perf_ctrl_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r();
5868 _sm_dsm_perf_ctrl_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r();
5872 /* TBD: would like to handle this elsewhere, at a higher level.
5873 * these are currently constructed in a "test-then-write" style
5874 * which makes it impossible to know externally whether a ctx
5875 * write will actually occur. so later we should put a lazy,
5876 * map-and-hold system in the patch write state */
5877 int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
5878 struct channel_ctx_gk20a *ch_ctx,
5882 u32 num_gpc = g->gr.gpc_count;
5890 init_sm_dsm_reg_info();
5892 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5894 for (reg = 0; reg < _num_ovr_perf_regs; reg++) {
5895 for (gpc = 0; gpc < num_gpc; gpc++) {
5896 num_tpc = g->gr.gpc_tpc_count[gpc];
5897 for (tpc = 0; tpc < num_tpc; tpc++) {
5898 chk_addr = ((proj_gpc_stride_v() * gpc) +
5899 (proj_tpc_in_gpc_stride_v() * tpc) +
5900 _ovr_perf_regs[reg]);
5901 if (chk_addr != addr)
5903 /* reset the patch count from previous
5904 runs, if ucode has already processed
5906 tmp = gk20a_mem_rd32(context +
5907 ctxsw_prog_main_image_patch_count_o(), 0);
5910 ch_ctx->patch_ctx.data_count = 0;
5912 gr_gk20a_ctx_patch_write(g, ch_ctx,
5915 vaddr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
5916 vaddr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
5918 gk20a_mem_wr32(context +
5919 ctxsw_prog_main_image_patch_count_o(),
5920 0, ch_ctx->patch_ctx.data_count);
5921 gk20a_mem_wr32(context +
5922 ctxsw_prog_main_image_patch_adr_lo_o(),
5924 gk20a_mem_wr32(context +
5925 ctxsw_prog_main_image_patch_adr_hi_o(),
5928 /* we're not caching these on cpu side,
5929 but later watch for it */
5938 static void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset)
5947 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "offset=0x%x", offset);
5949 gpc = pri_get_gpc_num(offset);
5950 gpc_tpc_addr = pri_gpccs_addr_mask(offset);
5951 tpc = pri_get_tpc_num(gpc_tpc_addr);
5953 quad_ctrl = quad & 0x1; /* first bit tells us quad */
5954 half_ctrl = (quad >> 1) & 0x1; /* second bit tells us half */
5956 gpc_tpc_stride = gpc * proj_gpc_stride_v() +
5957 tpc * proj_tpc_in_gpc_stride_v();
5958 gpc_tpc_addr = gr_gpc0_tpc0_sm_halfctl_ctrl_r() + gpc_tpc_stride;
5960 reg = gk20a_readl(g, gpc_tpc_addr);
5961 reg = set_field(reg,
5962 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(),
5963 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_f(quad_ctrl));
5965 gk20a_writel(g, gpc_tpc_addr, reg);
5967 gpc_tpc_addr = gr_gpc0_tpc0_sm_debug_sfe_control_r() + gpc_tpc_stride;
5968 reg = gk20a_readl(g, gpc_tpc_addr);
5969 reg = set_field(reg,
5970 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(),
5971 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_f(half_ctrl));
5972 gk20a_writel(g, gpc_tpc_addr, reg);
5975 #define ILLEGAL_ID (~0)
5977 static inline bool check_main_image_header_magic(void *context)
5979 u32 magic = gk20a_mem_rd32(context +
5980 ctxsw_prog_main_image_magic_value_o(), 0);
5981 gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic);
5982 return magic == ctxsw_prog_main_image_magic_value_v_value_v();
5984 static inline bool check_local_header_magic(void *context)
5986 u32 magic = gk20a_mem_rd32(context +
5987 ctxsw_prog_local_magic_value_o(), 0);
5988 gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x", magic);
5989 return magic == ctxsw_prog_local_magic_value_v_value_v();
5993 /* most likely dupe of ctxsw_gpccs_header__size_1_v() */
5994 static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
5999 void gr_gk20a_get_sm_dsm_perf_regs(struct gk20a *g,
6000 u32 *num_sm_dsm_perf_regs,
6001 u32 **sm_dsm_perf_regs,
6002 u32 *perf_register_stride)
6004 *num_sm_dsm_perf_regs = _num_sm_dsm_perf_regs;
6005 *sm_dsm_perf_regs = _sm_dsm_perf_regs;
6006 *perf_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v();
6009 void gr_gk20a_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
6010 u32 *num_sm_dsm_perf_ctrl_regs,
6011 u32 **sm_dsm_perf_ctrl_regs,
6012 u32 *ctrl_register_stride)
6014 *num_sm_dsm_perf_ctrl_regs = _num_sm_dsm_perf_ctrl_regs;
6015 *sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs;
6016 *ctrl_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v();
6019 static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
6021 bool is_quad, u32 quad,
6022 u32 *context_buffer,
6023 u32 context_buffer_size,
6027 u32 gpc_num, tpc_num;
6028 u32 num_gpcs, num_tpcs;
6030 u32 ext_priv_offset, ext_priv_size;
6032 u32 offset_to_segment, offset_to_segment_end;
6033 u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
6034 u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
6035 u32 num_ext_gpccs_ext_buffer_segments;
6036 u32 inter_seg_offset;
6037 u32 tpc_gpc_mask = (proj_tpc_in_gpc_stride_v() - 1);
6039 u32 *sm_dsm_perf_ctrl_regs = NULL;
6040 u32 num_sm_dsm_perf_ctrl_regs = 0;
6041 u32 *sm_dsm_perf_regs = NULL;
6042 u32 num_sm_dsm_perf_regs = 0;
6043 u32 buffer_segments_size = 0;
6044 u32 marker_size = 0;
6045 u32 control_register_stride = 0;
6046 u32 perf_register_stride = 0;
6048 /* Only have TPC registers in extended region, so if not a TPC reg,
6049 then return error so caller can look elsewhere. */
6050 if (pri_is_gpc_addr(addr)) {
6052 gpc_num = pri_get_gpc_num(addr);
6053 gpc_addr = pri_gpccs_addr_mask(addr);
6054 if (pri_is_tpc_addr(gpc_addr))
6055 tpc_num = pri_get_tpc_num(gpc_addr);
6059 gk20a_dbg_info(" gpc = %d tpc = %d",
6064 buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
6065 /* note below is in words/num_registers */
6066 marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
6068 context = context_buffer;
6069 /* sanity check main header */
6070 if (!check_main_image_header_magic(context)) {
6071 gk20a_err(dev_from_gk20a(g),
6072 "Invalid main header: magic value");
6075 num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
6076 if (gpc_num >= num_gpcs) {
6077 gk20a_err(dev_from_gk20a(g),
6078 "GPC 0x%08x is greater than total count 0x%08x!\n",
6083 data32 = gk20a_mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0);
6084 ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
6085 if (0 == ext_priv_size) {
6086 gk20a_dbg_info(" No extended memory in context buffer");
6089 ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
6091 offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
6092 offset_to_segment_end = offset_to_segment +
6093 (ext_priv_size * buffer_segments_size);
6095 /* check local header magic */
6096 context += ctxsw_prog_ucode_header_size_in_bytes();
6097 if (!check_local_header_magic(context)) {
6098 gk20a_err(dev_from_gk20a(g),
6099 "Invalid local header: magic value\n");
6104 * See if the incoming register address is in the first table of
6105 * registers. We check this by decoding only the TPC addr portion.
6106 * If we get a hit on the TPC bit, we then double check the address
6107 * by computing it from the base gpc/tpc strides. Then make sure
6108 * it is a real match.
6110 g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs,
6112 &perf_register_stride);
6114 init_sm_dsm_reg_info();
6116 for (i = 0; i < num_sm_dsm_perf_regs; i++) {
6117 if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
6118 sm_dsm_perf_reg_id = i;
6120 gk20a_dbg_info("register match: 0x%08x",
6121 sm_dsm_perf_regs[i]);
6123 chk_addr = (proj_gpc_base_v() +
6124 (proj_gpc_stride_v() * gpc_num) +
6125 proj_tpc_in_gpc_base_v() +
6126 (proj_tpc_in_gpc_stride_v() * tpc_num) +
6127 (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask));
6129 if (chk_addr != addr) {
6130 gk20a_err(dev_from_gk20a(g),
6131 "Oops addr miss-match! : 0x%08x != 0x%08x\n",
6139 /* Didn't find the register in supported group 1,
6140 * so try the second group now. */
6141 g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs,
6142 &sm_dsm_perf_ctrl_regs,
6143 &control_register_stride);
6145 if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
6146 for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
6147 if ((addr & tpc_gpc_mask) ==
6148 (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
6149 sm_dsm_perf_ctrl_reg_id = i;
6151 gk20a_dbg_info("register match: 0x%08x",
6152 sm_dsm_perf_ctrl_regs[i]);
6154 chk_addr = (proj_gpc_base_v() +
6155 (proj_gpc_stride_v() * gpc_num) +
6156 proj_tpc_in_gpc_base_v() +
6157 (proj_tpc_in_gpc_stride_v() * tpc_num) +
6158 (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
6161 if (chk_addr != addr) {
6162 gk20a_err(dev_from_gk20a(g),
6163 "Oops addr miss-match! : 0x%08x != 0x%08x\n",
6174 if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
6175 (ILLEGAL_ID == sm_dsm_perf_reg_id))
6178 /* Skip the FECS extended header, nothing there for us now. */
6179 offset_to_segment += buffer_segments_size;
6181 /* Skip through the GPCCS extended headers until we get to the data for
6182 * our GPC. The size of each GPC extended segment is enough to hold the
6183 * max TPC count for the GPCs, in 256B chunks.
6186 max_tpc_count = proj_scal_litter_num_tpc_per_gpc_v();
6188 num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);
6190 offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
6191 buffer_segments_size * gpc_num);
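/* Worked example of the skip above, assuming a hypothetical max_tpc_count
 * of 4 and the 256-byte buffer_segments_size noted earlier: each GPC needs
 * (4 + 1) / 2 = 2 extended-buffer segments, so reaching gpc_num = 3 skips
 * 2 * 256 * 3 = 1536 bytes past the FECS extended header. */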
6193 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
6195 /* skip the head marker to start with */
6196 inter_seg_offset = marker_size;
6198 if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
6199 /* skip over the control regs of the TPCs before the one we want,
6200 * then skip to the register in this TPC */
6201 inter_seg_offset = inter_seg_offset +
6202 (tpc_num * control_register_stride) +
6203 sm_dsm_perf_ctrl_reg_id;
6205 /* skip all the control registers */
6206 inter_seg_offset = inter_seg_offset +
6207 (num_tpcs * control_register_stride);
6209 /* skip the marker between control and counter segments */
6210 inter_seg_offset += marker_size;
6212 /* skip over counter regs of TPCs before the one we want */
6213 inter_seg_offset = inter_seg_offset +
6214 (tpc_num * perf_register_stride) *
6215 ctxsw_prog_extended_num_smpc_quadrants_v();
6217 /* skip over the registers for the quadrants we do not want,
6218 * then skip to the register in this TPC */
6219 inter_seg_offset = inter_seg_offset +
6220 (perf_register_stride * quad) +
6224 /* set the offset to the segment offset plus the inter-segment offset to
point at the register within the extended buffer */
6226 offset_to_segment += (inter_seg_offset * 4);
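/* Sketch of the per-GPC extended segment layout implied by the offset
 * arithmetic above (word granularity, strides per the ctxsw headers):
 *
 *   [marker]
 *   [TPC0 ctrl regs][TPC1 ctrl regs]...        control_register_stride each
 *   [marker]
 *   [TPC0 quad0..N perf regs][TPC1 ...]        perf_register_stride per quad
 *
 * A control-register hit stays in the first table; a counter hit first
 * skips all num_tpcs control blocks plus the second marker. The final
 * "* 4" converts the word-based inter_seg_offset into a byte offset. */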
6228 /* last sanity check: did we somehow compute an offset outside the
6229 * extended buffer? */
6230 if (offset_to_segment > offset_to_segment_end) {
6231 gk20a_err(dev_from_gk20a(g),
6232 "Overflow ctxsw buffer! 0x%08x > 0x%08x\n",
6233 offset_to_segment, offset_to_segment_end);
6237 *priv_offset = offset_to_segment;
6244 gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
6245 int addr_type,/* enum ctxsw_addr_type */
6247 u32 gpc_num, u32 num_tpcs,
6248 u32 num_ppcs, u32 ppc_mask,
6252 u32 address, base_address;
6253 u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
6254 u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
6255 struct aiv_gk20a *reg;
6257 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);
6259 if (!g->gr.ctx_vars.valid)
6262 /* Process the SYS/BE segment. */
6263 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6264 (addr_type == CTXSW_ADDR_TYPE_BE)) {
6265 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
6266 reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
6267 address = reg->addr;
6268 sys_offset = reg->index;
6270 if (pri_addr == address) {
6271 *priv_offset = sys_offset;
6277 /* Process the TPC segment. */
6278 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
6279 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
6280 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
6281 reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
6282 address = reg->addr;
6283 tpc_addr = pri_tpccs_addr_mask(address);
6284 base_address = proj_gpc_base_v() +
6285 (gpc_num * proj_gpc_stride_v()) +
6286 proj_tpc_in_gpc_base_v() +
6287 (tpc_num * proj_tpc_in_gpc_stride_v());
6288 address = base_address + tpc_addr;
6290 * The data for the TPCs is interleaved in the context buffer.
6291 * Example with num_tpcs = 2
6292 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
6293 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
6295 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
6297 if (pri_addr == address) {
6298 *priv_offset = tpc_offset;
6305 /* Process the PPC segment. */
6306 if (addr_type == CTXSW_ADDR_TYPE_PPC) {
6307 for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
6308 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
6309 reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
6310 address = reg->addr;
6311 ppc_addr = pri_ppccs_addr_mask(address);
6312 base_address = proj_gpc_base_v() +
6313 (gpc_num * proj_gpc_stride_v()) +
6314 proj_ppc_in_gpc_base_v() +
6315 (ppc_num * proj_ppc_in_gpc_stride_v());
6316 address = base_address + ppc_addr;
6318 * The data for the PPCs is interleaved in the context buffer.
6319 * Example with num_ppcs = 2
6320 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
6321 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
6323 ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);
6325 if (pri_addr == address) {
6326 *priv_offset = ppc_offset;
6334 /* Process the GPC segment. */
6335 if (addr_type == CTXSW_ADDR_TYPE_GPC) {
6336 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
6337 reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];
6339 address = reg->addr;
6340 gpc_addr = pri_gpccs_addr_mask(address);
6341 gpc_offset = reg->index;
6343 base_address = proj_gpc_base_v() +
6344 (gpc_num * proj_gpc_stride_v());
6345 address = base_address + gpc_addr;
6347 if (pri_addr == address) {
6348 *priv_offset = gpc_offset;
6357 static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
6359 u32 *num_ppcs, u32 *ppc_mask,
6363 u32 litter_num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
6366 * if there is only 1 PES_PER_GPC, then we put the PES registers
6367 * in the GPC reglist, so we can't error out if ppc.count == 0
6369 if ((!g->gr.ctx_vars.valid) ||
6370 ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
6371 (litter_num_pes_per_gpc > 1)))
6374 data32 = gk20a_mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0);
6376 *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
6377 *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
6379 *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;
6387 * This function returns the 32-bit offset for a priv register if it is
6388 * present in the context buffer.
6390 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
6392 bool is_quad, u32 quad,
6393 u32 *context_buffer,
6394 u32 context_buffer_size,
6397 struct gr_gk20a *gr = &g->gr;
6400 int addr_type; /*enum ctxsw_addr_type */
6401 u32 broadcast_flags;
6402 u32 gpc_num, tpc_num, ppc_num, be_num;
6403 u32 num_gpcs, num_tpcs, num_ppcs;
6405 u32 sys_priv_offset, gpc_priv_offset;
6406 u32 ppc_mask, reg_list_ppc_count;
6408 u32 offset_to_segment;
6410 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6412 err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
6413 &gpc_num, &tpc_num, &ppc_num, &be_num,
6418 context = context_buffer;
6419 if (!check_main_image_header_magic(context)) {
6420 gk20a_err(dev_from_gk20a(g),
6421 "Invalid main header: magic value");
6424 num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
6426 /* Parse the FECS local header. */
6427 context += ctxsw_prog_ucode_header_size_in_bytes();
6428 if (!check_local_header_magic(context)) {
6429 gk20a_err(dev_from_gk20a(g),
6430 "Invalid FECS local header: magic value\n");
6433 data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
6434 sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
6436 /* If the register is found in the extended buffer, we are done.
6437 * If the lookup failed but we expected to find it there (a quad offset),
6438 * return the error. Otherwise continue searching the main buffer.
6440 err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
6441 addr, is_quad, quad, context_buffer,
6442 context_buffer_size, priv_offset);
6443 if (!err || (err && is_quad))
6446 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6447 (addr_type == CTXSW_ADDR_TYPE_BE)) {
6448 /* Find the offset in the FECS segment. */
6449 offset_to_segment = sys_priv_offset *
6450 ctxsw_prog_ucode_header_size_in_bytes();
6452 err = gr_gk20a_process_context_buffer_priv_segment(g,
6459 *priv_offset = (offset_to_segment + offset);
6463 if ((gpc_num + 1) > num_gpcs) {
6464 gk20a_err(dev_from_gk20a(g),
6465 "GPC %d not in this context buffer.\n",
6470 /* Parse the GPCCS local header(s).*/
6471 for (i = 0; i < num_gpcs; i++) {
6472 context += ctxsw_prog_ucode_header_size_in_bytes();
6473 if (!check_local_header_magic(context)) {
6474 gk20a_err(dev_from_gk20a(g),
6475 "Invalid GPCCS local header: magic value\n");
6479 data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
6480 gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
6482 err = gr_gk20a_determine_ppc_configuration(g, context,
6483 &num_ppcs, &ppc_mask,
6484 &reg_list_ppc_count);
6488 num_tpcs = gk20a_mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0);
6490 if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
6491 gk20a_err(dev_from_gk20a(g),
6492 "GPC %d TPC %d not in this context buffer.\n",
6497 /* Find the offset in the GPCCS segment.*/
6499 offset_to_segment = gpc_priv_offset *
6500 ctxsw_prog_ucode_header_size_in_bytes();
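/* The GPCCS priv segment stores TPC data first, then PPC data, then GPC
 * data, so the branches below advance offset_to_segment past whichever
 * tables precede the requested address type, e.g.
 *
 *   TPC lookup: no extra skip
 *   PPC lookup: skip tpc.count * num_tpcs entries
 *   GPC lookup: skip the TPC table and, when there is more than one PES
 *               per GPC, the reg_list_ppc_count * num_ppcs PPC entries
 *
 * with the "<< 2" scaling entry counts to byte offsets. */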
6502 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
6503 /*reg = gr->ctx_vars.ctxsw_regs.tpc.l;*/
6504 } else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
6505 /* The ucode stores TPC data before PPC data.
6506 * Advance offset past TPC data to PPC data. */
6507 offset_to_segment +=
6508 ((gr->ctx_vars.ctxsw_regs.tpc.count *
6510 } else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
6511 /* The ucode stores TPC/PPC data before GPC data.
6512 * Advance offset past TPC/PPC data to GPC data. */
6513 /* note 1 PES_PER_GPC case */
6514 u32 litter_num_pes_per_gpc =
6515 proj_scal_litter_num_pes_per_gpc_v();
6516 if (litter_num_pes_per_gpc > 1) {
6517 offset_to_segment +=
6518 (((gr->ctx_vars.ctxsw_regs.tpc.count *
6520 ((reg_list_ppc_count * num_ppcs) << 2));
6522 offset_to_segment +=
6523 ((gr->ctx_vars.ctxsw_regs.tpc.count *
6527 gk20a_err(dev_from_gk20a(g),
6528 " Unknown address type.\n");
6531 err = gr_gk20a_process_context_buffer_priv_segment(g,
6539 *priv_offset = offset_to_segment + offset;
6548 int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
6549 struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
6550 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops)
6552 struct gk20a *g = ch->g;
6553 struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
6554 void *ctx_ptr = NULL;
6555 int curr_gr_chid, curr_gr_ctx;
6556 bool ch_is_curr_ctx, restart_gr_ctxsw = false;
6557 u32 i, j, offset, v;
6558 u32 max_offsets = proj_scal_litter_num_gpcs_v() *
6559 proj_scal_litter_num_tpc_per_gpc_v();
6560 u32 *offsets = NULL;
6561 u32 *offset_addrs = NULL;
6562 u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
6565 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
6566 num_ctx_wr_ops, num_ctx_rd_ops);
6568 /* Disable channel switching.
6569 * At that point the hardware state can be inspected to
6570 * determine whether the context we're interested in is current.
6572 err = gr_gk20a_disable_ctxsw(g);
6574 gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw");
6575 /* this should probably be ctx-fatal... */
6579 restart_gr_ctxsw = true;
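/* Minimal sketch of the stop/inspect/restart pattern used in this
 * function (helper names as used here):
 *
 *   err = gr_gk20a_disable_ctxsw(g);        stop context switching
 *   if (!err)
 *           restart_gr_ctxsw = true;        remember to re-enable later
 *   ... read gr_fecs_current_ctx_r(), patch state ...
 *   if (restart_gr_ctxsw)
 *           gr_gk20a_enable_ctxsw(g);       balanced on the way out
 *
 * Reading the current-context register is only meaningful while context
 * switching is stopped, which is why the check just below happens inside
 * this window. */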
6581 curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
6582 curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx);
6583 ch_is_curr_ctx = (curr_gr_chid != -1) && (ch->hw_chid == curr_gr_chid);
6585 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d", ch_is_curr_ctx);
6586 if (ch_is_curr_ctx) {
6587 for (pass = 0; pass < 2; pass++) {
6589 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
6590 /* only do ctx ops and only on the right pass */
6591 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
6592 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
6593 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
6596 /* if this is a quad access, set up for special access */
6597 if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)
6598 && g->ops.gr.access_smpc_reg)
6599 g->ops.gr.access_smpc_reg(g,
6602 offset = ctx_ops[i].offset;
6604 if (pass == 0) { /* write pass */
6605 v = gk20a_readl(g, offset);
6606 v &= ~ctx_ops[i].and_n_mask_lo;
6607 v |= ctx_ops[i].value_lo;
6608 gk20a_writel(g, offset, v);
6610 gk20a_dbg(gpu_dbg_gpu_dbg,
6611 "direct wr: offset=0x%x v=0x%x",
6614 if (ctx_ops[i].op == REGOP(WRITE_64)) {
6615 v = gk20a_readl(g, offset + 4);
6616 v &= ~ctx_ops[i].and_n_mask_hi;
6617 v |= ctx_ops[i].value_hi;
6618 gk20a_writel(g, offset + 4, v);
6620 gk20a_dbg(gpu_dbg_gpu_dbg,
6621 "direct wr: offset=0x%x v=0x%x",
6625 } else { /* read pass */
6626 ctx_ops[i].value_lo =
6627 gk20a_readl(g, offset);
6629 gk20a_dbg(gpu_dbg_gpu_dbg,
6630 "direct rd: offset=0x%x v=0x%x",
6631 offset, ctx_ops[i].value_lo);
6633 if (ctx_ops[i].op == REGOP(READ_64)) {
6634 ctx_ops[i].value_hi =
6635 gk20a_readl(g, offset + 4);
6637 gk20a_dbg(gpu_dbg_gpu_dbg,
6638 "direct rd: offset=0x%x v=0x%x",
6639 offset, ctx_ops[i].value_lo);
6641 ctx_ops[i].value_hi = 0;
6649 /* they're the same size, so just use one alloc for both */
6650 offsets = kzalloc(2 * sizeof(u32) * max_offsets, GFP_KERNEL);
6655 offset_addrs = offsets + max_offsets;
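/* The single kzalloc above backs both arrays: offsets[] uses the first
 * max_offsets u32s and offset_addrs[] the second, e.g. with a hypothetical
 * max_offsets of 8, u32[0..7] hold offsets and u32[8..15] hold
 * offset_addrs. */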
6657 /* would have been a variant of gr_gk20a_apply_instmem_overrides */
6658 /* recoded in-place instead.*/
6659 ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
6660 PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
6661 0, pgprot_dmacoherent(PAGE_KERNEL));
6667 gk20a_mm_l2_flush(g, true);
6669 /* write to the appropriate place in the context image;
6670 * first we have to figure out where that really is */
6672 /* first pass is writes, second reads */
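/* Pass 0 applies every context write op and pass 1 then services the read
 * ops, so a read in the same batch can observe values just written; the
 * filter at the top of the loop simply skips ops belonging to the other
 * pass. */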
6673 for (pass = 0; pass < 2; pass++) {
6675 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
6678 /* only do ctx ops and only on the right pass */
6679 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
6680 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
6681 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
6684 err = gr_gk20a_get_ctx_buffer_offsets(g,
6687 offsets, offset_addrs,
6689 ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
6692 gk20a_dbg(gpu_dbg_gpu_dbg,
6693 "ctx op invalid offset: offset=0x%x",
6696 NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET;
6700 /* if this is a quad access, set up for special access */
6701 if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD) &&
6702 g->ops.gr.access_smpc_reg)
6703 g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad,
6706 for (j = 0; j < num_offsets; j++) {
6707 /* sanity check, don't write outside, worst case */
6708 if (offsets[j] >= g->gr.ctx_vars.golden_image_size)
6710 if (pass == 0) { /* write pass */
6711 v = gk20a_mem_rd32(ctx_ptr + offsets[j], 0);
6712 v &= ~ctx_ops[i].and_n_mask_lo;
6713 v |= ctx_ops[i].value_lo;
6714 gk20a_mem_wr32(ctx_ptr + offsets[j], 0, v);
6716 gk20a_dbg(gpu_dbg_gpu_dbg,
6717 "context wr: offset=0x%x v=0x%x",
6720 if (ctx_ops[i].op == REGOP(WRITE_64)) {
6721 v = gk20a_mem_rd32(ctx_ptr + offsets[j] + 4, 0);
6722 v &= ~ctx_ops[i].and_n_mask_hi;
6723 v |= ctx_ops[i].value_hi;
6724 gk20a_mem_wr32(ctx_ptr + offsets[j] + 4, 0, v);
6726 gk20a_dbg(gpu_dbg_gpu_dbg,
6727 "context wr: offset=0x%x v=0x%x",
6731 /* check to see if we need to add a special WAR
6732 for some of the SMPC perf regs */
6733 gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j],
6736 } else { /* read pass */
6737 ctx_ops[i].value_lo =
6738 gk20a_mem_rd32(ctx_ptr + offsets[0], 0);
6740 gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
6741 offsets[0], ctx_ops[i].value_lo);
6743 if (ctx_ops[i].op == REGOP(READ_64)) {
6744 ctx_ops[i].value_hi =
6745 gk20a_mem_rd32(ctx_ptr + offsets[0] + 4, 0);
6747 gk20a_dbg(gpu_dbg_gpu_dbg,
6748 "context rd: offset=0x%x v=0x%x",
6749 offsets[0] + 4, ctx_ops[i].value_hi);
6751 ctx_ops[i].value_hi = 0;
6758 /* Flush CPU caches for the ctx buffer? Only if it were CPU-cached,
6759 * which it isn't yet. */
6761 FLUSH_CPU_DCACHE(ctx_ptr,
6762 sg_phys(ch_ctx->gr_ctx.mem.ref), size);
6773 if (restart_gr_ctxsw) {
6774 int tmp_err = gr_gk20a_enable_ctxsw(g);
6776 gk20a_err(dev_from_gk20a(g), "unable to restart ctxsw!\n");
6784 static void gr_gk20a_cb_size_default(struct gk20a *g)
6786 struct gr_gk20a *gr = &g->gr;
6788 gr->attrib_cb_default_size =
6789 gr_gpc0_ppc0_cbm_cfg_size_default_v();
6790 gr->alpha_cb_default_size =
6791 gr_gpc0_ppc0_cbm_cfg2_size_default_v();
6794 static int gr_gk20a_calc_global_ctx_buffer_size(struct gk20a *g)
6796 struct gr_gk20a *gr = &g->gr;
6799 gr->attrib_cb_size = gr->attrib_cb_default_size;
6800 gr->alpha_cb_size = gr->alpha_cb_default_size
6801 + (gr->alpha_cb_default_size >> 1);
6803 size = gr->attrib_cb_size *
6804 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() *
6807 size += gr->alpha_cb_size *
6808 gr_gpc0_ppc0_cbm_cfg2_size_granularity_v() *
6814 void gr_gk20a_commit_global_pagepool(struct gk20a *g,
6815 struct channel_ctx_gk20a *ch_ctx,
6816 u64 addr, u32 size, bool patch)
6818 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
6819 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
6821 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
6822 gr_scc_pagepool_total_pages_f(size) |
6823 gr_scc_pagepool_valid_true_f(), patch);
6825 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
6826 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
6828 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
6829 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
6831 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
6832 gr_pd_pagepool_total_pages_f(size) |
6833 gr_pd_pagepool_valid_true_f(), patch);
6836 void gk20a_init_gr(struct gk20a *g)
6838 init_waitqueue_head(&g->gr.init_wq);
6841 void gk20a_init_gr_ops(struct gpu_ops *gops)
6843 gops->gr.access_smpc_reg = gr_gk20a_access_smpc_reg;
6844 gops->gr.bundle_cb_defaults = gr_gk20a_bundle_cb_defaults;
6845 gops->gr.cb_size_default = gr_gk20a_cb_size_default;
6846 gops->gr.calc_global_ctx_buffer_size =
6847 gr_gk20a_calc_global_ctx_buffer_size;
6848 gops->gr.commit_global_attrib_cb = gr_gk20a_commit_global_attrib_cb;
6849 gops->gr.commit_global_bundle_cb = gr_gk20a_commit_global_bundle_cb;
6850 gops->gr.commit_global_cb_manager = gr_gk20a_commit_global_cb_manager;
6851 gops->gr.commit_global_pagepool = gr_gk20a_commit_global_pagepool;
6852 gops->gr.handle_sw_method = gr_gk20a_handle_sw_method;
6853 gops->gr.set_alpha_circular_buffer_size =
6854 gk20a_gr_set_alpha_circular_buffer_size;
6855 gops->gr.set_circular_buffer_size =
6856 gk20a_gr_set_circular_buffer_size;
6857 gops->gr.enable_hww_exceptions = gr_gk20a_enable_hww_exceptions;
6858 gops->gr.is_valid_class = gr_gk20a_is_valid_class;
6859 gops->gr.get_sm_dsm_perf_regs = gr_gk20a_get_sm_dsm_perf_regs;
6860 gops->gr.get_sm_dsm_perf_ctrl_regs = gr_gk20a_get_sm_dsm_perf_ctrl_regs;
6861 gops->gr.init_fs_state = gr_gk20a_ctx_state_floorsweep;
6862 gops->gr.set_hww_esr_report_mask = gr_gk20a_set_hww_esr_report_mask;
6863 gops->gr.setup_alpha_beta_tables = gr_gk20a_setup_alpha_beta_tables;