/*
 * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */
#include <linux/delay.h>	/* for udelay */
#include <linux/mm.h>		/* for totalram_pages */
#include <linux/scatterlist.h>
#include <linux/tegra-soc.h>
#include <linux/debugfs.h>
#include <uapi/linux/nvgpu.h>
#include <linux/vmalloc.h>
#include <linux/dma-mapping.h>
#include <linux/firmware.h>
#include <linux/nvhost.h>

#include <trace/events/gk20a.h>
#include "gk20a.h"
#include "kind_gk20a.h"
#include "gr_ctx_gk20a.h"

#include "hw_ccsr_gk20a.h"
#include "hw_ctxsw_prog_gk20a.h"
#include "hw_fifo_gk20a.h"
#include "hw_gr_gk20a.h"
#include "hw_gmmu_gk20a.h"
#include "hw_mc_gk20a.h"
#include "hw_ram_gk20a.h"
#include "hw_pri_ringmaster_gk20a.h"
#include "hw_pri_ringstation_sys_gk20a.h"
#include "hw_pri_ringstation_gpc_gk20a.h"
#include "hw_pri_ringstation_fbp_gk20a.h"
#include "hw_proj_gk20a.h"
#include "hw_top_gk20a.h"
#include "hw_ltc_gk20a.h"
#include "hw_fb_gk20a.h"
#include "hw_therm_gk20a.h"
#include "hw_pbdma_gk20a.h"
#include "gr_pri_gk20a.h"
#include "regops_gk20a.h"
#include "dbg_gpu_gk20a.h"
#include "debug_gk20a.h"
#include "semaphore_gk20a.h"
#include "platform_gk20a.h"
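/* Falcon ucode boot/code/data segments are packed into the ucode surface at
 * offsets aligned to this block size; see gr_gk20a_init_ctxsw_ucode_segment()
 * below. */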
#define BLK_SIZE	(256)
static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g);
static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);

/* global ctx buffer */
static int  gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
static int  gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
					    struct channel_gk20a *c);
static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);

/* channel gr ctx buffer */
static int  gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
					  struct channel_gk20a *c,
					  u32 class, u32 padding);
static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);

/* channel patch ctx buffer */
static int  gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
					     struct channel_gk20a *c);
static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);

/* golden ctx image */
static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
					  struct channel_gk20a *c);

static void gr_gk20a_enable_elcg(struct gk20a *g);

static int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc,
		u32 global_esr_mask, bool check_errors);
void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
{
	int i;

	gk20a_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
		gk20a_readl(g, gr_fecs_os_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
		gk20a_readl(g, gr_fecs_cpuctl_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
		gk20a_readl(g, gr_fecs_idlestate_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
		gk20a_readl(g, gr_fecs_mailbox0_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
		gk20a_readl(g, gr_fecs_mailbox1_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
		gk20a_readl(g, gr_fecs_irqstat_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
		gk20a_readl(g, gr_fecs_irqmode_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
		gk20a_readl(g, gr_fecs_irqmask_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
		gk20a_readl(g, gr_fecs_irqdest_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
		gk20a_readl(g, gr_fecs_debug1_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
		gk20a_readl(g, gr_fecs_debuginfo_r()));

	for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
		gk20a_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
			i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));

	gk20a_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
		gk20a_readl(g, gr_fecs_engctl_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
		gk20a_readl(g, gr_fecs_curctx_r()));
	gk20a_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
		gk20a_readl(g, gr_fecs_nxtctx_r()));

	/* Internal falcon registers are read back through the ICD interface:
	 * write an opc_rreg command selecting the register index, then read
	 * the value from gr_fecs_icd_rdata_r(). */
	gk20a_writel(g, gr_fecs_icd_cmd_r(),
		gr_fecs_icd_cmd_opc_rreg_f() |
		gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
	gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
		gk20a_readl(g, gr_fecs_icd_rdata_r()));

	gk20a_writel(g, gr_fecs_icd_cmd_r(),
		gr_fecs_icd_cmd_opc_rreg_f() |
		gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
	gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
		gk20a_readl(g, gr_fecs_icd_rdata_r()));

	gk20a_writel(g, gr_fecs_icd_cmd_r(),
		gr_fecs_icd_cmd_opc_rreg_f() |
		gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
	gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
		gk20a_readl(g, gr_fecs_icd_rdata_r()));

	gk20a_writel(g, gr_fecs_icd_cmd_r(),
		gr_fecs_icd_cmd_opc_rreg_f() |
		gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
	gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
		gk20a_readl(g, gr_fecs_icd_rdata_r()));

	gk20a_writel(g, gr_fecs_icd_cmd_r(),
		gr_fecs_icd_cmd_opc_rreg_f() |
		gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
	gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
		gk20a_readl(g, gr_fecs_icd_rdata_r()));

	for (i = 0; i < 4; i++) {
		gk20a_writel(g, gr_fecs_icd_cmd_r(),
			gr_fecs_icd_cmd_opc_rreg_f() |
			gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
		gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
			gk20a_readl(g, gr_fecs_icd_rdata_r()));

		gk20a_writel(g, gr_fecs_icd_cmd_r(),
			gr_fecs_icd_cmd_opc_rreg_f() |
			gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
		gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
			gk20a_readl(g, gr_fecs_icd_rdata_r()));
	}
}
static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
{
	u32 i, ucode_u32_size;
	const u32 *ucode_u32_data;
	u32 checksum;

	gk20a_dbg_fn("");

	/* Configure DMEM port 0 for auto-incrementing writes from offset 0. */
	gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
					gr_gpccs_dmemc_blk_f(0)  |
					gr_gpccs_dmemc_aincw_f(1)));

	ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
	ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;

	for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
		gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
		checksum += ucode_u32_data[i];
	}

	gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
					gr_fecs_dmemc_blk_f(0)  |
					gr_fecs_dmemc_aincw_f(1)));

	ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
	ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;

	for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
		gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
		checksum += ucode_u32_data[i];
	}
	gk20a_dbg_fn("done");
}
static void gr_gk20a_load_falcon_imem(struct gk20a *g)
{
	u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
	const u32 *ucode_u32_data;
	u32 tag, i, pad_start, pad_end;
	u32 checksum;

	gk20a_dbg_fn("");

	cfg = gk20a_readl(g, gr_fecs_cfg_r());
	fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);

	cfg = gk20a_readl(g, gr_gpc0_cfg_r());
	gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);

	/* Use the broadcast address to access all of the GPCCS units. */
	gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
					gr_gpccs_imemc_blk_f(0) |
					gr_gpccs_imemc_aincw_f(1)));

	/* Setup the tags for the instruction memory. */
	tag = 0;
	gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));

	ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
	ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;

	for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
		if (i && ((i % (256/sizeof(u32))) == 0)) {
			tag++;
			gk20a_writel(g, gr_gpccs_imemt_r(0),
				      gr_gpccs_imemt_tag_f(tag));
		}
		gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
		checksum += ucode_u32_data[i];
	}

	/* Pad out to the next full 256-byte block with zeroes. */
	pad_start = i * 4;
	pad_end = pad_start + (256 - pad_start % 256) + 256;
	for (i = pad_start;
	     (i < gpccs_imem_size * 256) && (i < pad_end);
	     i += 4) {
		if (i && ((i % 256) == 0)) {
			tag++;
			gk20a_writel(g, gr_gpccs_imemt_r(0),
				      gr_gpccs_imemt_tag_f(tag));
		}
		gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
	}

	gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
					gr_fecs_imemc_blk_f(0) |
					gr_fecs_imemc_aincw_f(1)));

	/* Setup the tags for the instruction memory. */
	tag = 0;
	gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));

	ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
	ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;

	for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
		if (i && ((i % (256/sizeof(u32))) == 0)) {
			tag++;
			gk20a_writel(g, gr_fecs_imemt_r(0),
				      gr_fecs_imemt_tag_f(tag));
		}
		gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
		checksum += ucode_u32_data[i];
	}

	pad_start = i * 4;
	pad_end = pad_start + (256 - pad_start % 256) + 256;
	for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
		if (i && ((i % 256) == 0)) {
			tag++;
			gk20a_writel(g, gr_fecs_imemt_r(0),
				      gr_fecs_imemt_tag_f(tag));
		}
		gk20a_writel(g, gr_fecs_imemd_r(0), 0);
	}
}
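/*
 * A note on the tagging above (informal): falcon IMEM is paged in 256-byte
 * blocks and each block carries a tag.  Writing through the auto-incrementing
 * data port, one block holds 256 / sizeof(u32) = 64 words, so the ucode loops
 * bump the tag every 64 word writes, while the pad loops count i in bytes and
 * bump it whenever i % 256 == 0.
 */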
int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
		       u32 expect_delay)
{
	u32 delay = expect_delay;
	bool gr_enabled;
	bool ctxsw_active;
	bool gr_busy;

	gk20a_dbg_fn("");

	do {
		/* fmodel: host gets fifo_engine_status(gr) from gr
		   only when gr_status is read */
		gk20a_readl(g, gr_status_r());

		gr_enabled = gk20a_readl(g, mc_enable_r()) &
			mc_enable_pgraph_enabled_f();

		ctxsw_active = gk20a_readl(g,
			fifo_engine_status_r(ENGINE_GR_GK20A)) &
			fifo_engine_status_ctxsw_in_progress_f();

		gr_busy = gk20a_readl(g, gr_engine_status_r()) &
			gr_engine_status_value_busy_f();

		if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
			gk20a_dbg_fn("done");
			return 0;
		}

		/* Exponential backoff, capped at GR_IDLE_CHECK_MAX. */
		usleep_range(delay, delay * 2);
		delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);

	} while (time_before(jiffies, end_jiffies)
			|| !tegra_platform_is_silicon());

	gk20a_err(dev_from_gk20a(g),
		"timeout, ctxsw busy : %d, gr busy : %d",
		ctxsw_active, gr_busy);

	return -EAGAIN;
}
static int gr_gk20a_wait_fe_idle(struct gk20a *g, unsigned long end_jiffies,
				 u32 expect_delay)
{
	u32 val;
	u32 delay = expect_delay;

	if (tegra_platform_is_linsim())
		return 0;

	gk20a_dbg_fn("");

	do {
		val = gk20a_readl(g, gr_status_r());

		if (!gr_status_fe_method_lower_v(val)) {
			gk20a_dbg_fn("done");
			return 0;
		}

		usleep_range(delay, delay * 2);
		delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
	} while (time_before(jiffies, end_jiffies)
			|| !tegra_platform_is_silicon());

	gk20a_err(dev_from_gk20a(g),
		"timeout, fe busy : %x", val);

	return -EAGAIN;
}
static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
				   u32 *mailbox_ret, u32 opc_success,
				   u32 mailbox_ok, u32 opc_fail,
				   u32 mailbox_fail, bool sleepduringwait)
{
	unsigned long end_jiffies = jiffies +
		msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
	u32 delay = GR_FECS_POLL_INTERVAL;
	u32 check = WAIT_UCODE_LOOP;
	u32 reg;

	gk20a_dbg_fn("");

	if (sleepduringwait)
		delay = GR_IDLE_CHECK_DEFAULT;

	while (check == WAIT_UCODE_LOOP) {
		if (!time_before(jiffies, end_jiffies) &&
				tegra_platform_is_silicon())
			check = WAIT_UCODE_TIMEOUT;

		reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));

		if (mailbox_ret)
			*mailbox_ret = reg;

		switch (opc_success) {
		case GR_IS_UCODE_OP_EQUAL:
			if (reg == mailbox_ok)
				check = WAIT_UCODE_OK;
			break;
		case GR_IS_UCODE_OP_NOT_EQUAL:
			if (reg != mailbox_ok)
				check = WAIT_UCODE_OK;
			break;
		case GR_IS_UCODE_OP_AND:
			if (reg & mailbox_ok)
				check = WAIT_UCODE_OK;
			break;
		case GR_IS_UCODE_OP_LESSER:
			if (reg < mailbox_ok)
				check = WAIT_UCODE_OK;
			break;
		case GR_IS_UCODE_OP_LESSER_EQUAL:
			if (reg <= mailbox_ok)
				check = WAIT_UCODE_OK;
			break;
		case GR_IS_UCODE_OP_SKIP:
			/* do no success check */
			break;
		default:
			gk20a_err(dev_from_gk20a(g),
				   "invalid success opcode 0x%x", opc_success);

			check = WAIT_UCODE_ERROR;
			break;
		}

		switch (opc_fail) {
		case GR_IS_UCODE_OP_EQUAL:
			if (reg == mailbox_fail)
				check = WAIT_UCODE_ERROR;
			break;
		case GR_IS_UCODE_OP_NOT_EQUAL:
			if (reg != mailbox_fail)
				check = WAIT_UCODE_ERROR;
			break;
		case GR_IS_UCODE_OP_AND:
			if (reg & mailbox_fail)
				check = WAIT_UCODE_ERROR;
			break;
		case GR_IS_UCODE_OP_LESSER:
			if (reg < mailbox_fail)
				check = WAIT_UCODE_ERROR;
			break;
		case GR_IS_UCODE_OP_LESSER_EQUAL:
			if (reg <= mailbox_fail)
				check = WAIT_UCODE_ERROR;
			break;
		case GR_IS_UCODE_OP_SKIP:
			/* do no check on failure */
			break;
		default:
			gk20a_err(dev_from_gk20a(g),
				   "invalid fail opcode 0x%x", opc_fail);
			check = WAIT_UCODE_ERROR;
			break;
		}

		if (sleepduringwait) {
			usleep_range(delay, delay * 2);
			delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
		} else
			udelay(delay);
	}

	if (check == WAIT_UCODE_TIMEOUT) {
		gk20a_err(dev_from_gk20a(g),
			   "timeout waiting on ucode response");
		gk20a_fecs_dump_falcon_stats(g);
		gk20a_gr_debug_dump(g->dev);
		return -1;
	} else if (check == WAIT_UCODE_ERROR) {
		gk20a_err(dev_from_gk20a(g),
			   "ucode method failed on mailbox=%d value=0x%08x",
			   mailbox_id, reg);
		gk20a_fecs_dump_falcon_stats(g);
		return -1;
	}

	gk20a_dbg_fn("done");
	return 0;
}
/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...).
 * We should replace most, if not all, FECS method calls with this instead. */
int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
				   struct fecs_method_op_gk20a op,
				   bool sleepduringwait)
{
	struct gr_gk20a *gr = &g->gr;
	int ret;

	mutex_lock(&gr->fecs_mutex);

	if (op.mailbox.id != 0)
		gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
			     op.mailbox.data);

	gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
		gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));

	gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
	gk20a_writel(g, gr_fecs_method_push_r(),
		gr_fecs_method_push_adr_f(op.method.addr));

	/* op.mailbox.id == 4 cases require waiting for completion
	 * on op.mailbox.id == 0 */
	if (op.mailbox.id == 4)
		op.mailbox.id = 0;

	ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
				      op.cond.ok, op.mailbox.ok,
				      op.cond.fail, op.mailbox.fail,
				      sleepduringwait);

	mutex_unlock(&gr->fecs_mutex);

	return ret;
}
int gr_gk20a_submit_fecs_method_wfi(struct gk20a *g)
{
	u32 cur_ctx;

	cur_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());

	if (gr_fecs_current_ctx_valid_v(cur_ctx))
		gr_gk20a_submit_fecs_method_op(g,
			(struct fecs_method_op_gk20a) {
			.method.data = cur_ctx,
			.method.addr = gr_fecs_method_push_adr_wfi_v(),
			.cond.ok = GR_IS_UCODE_OP_EQUAL,
			.cond.fail = GR_IS_UCODE_OP_EQUAL,
			.mailbox.fail = 0xA}, false);

	return 0;
}
static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
{
	return gr_gk20a_submit_fecs_method_op(g,
	      (struct fecs_method_op_gk20a) {
	      .method.addr = fecs_method,
	      .method.data = ~0,
	      .mailbox = { .id   = 1, /*sideband?*/
			   .data = ~0, .clr = ~0, .ret = ret,
			   .ok   = gr_fecs_ctxsw_mailbox_value_pass_v(),
			   .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
	      .cond.ok = GR_IS_UCODE_OP_EQUAL,
	      .cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
}
/* Stop processing (stall) context switches at FECS.
 * The caller must hold the dbg_sessions_lock, else if multiple stop methods
 * are sent to the ucode in sequence, it can get into an undefined state. */
int gr_gk20a_disable_ctxsw(struct gk20a *g)
{
	gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
	return gr_gk20a_ctrl_ctxsw(g,
			gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
}

/* Start processing (continue) context switches at FECS */
int gr_gk20a_enable_ctxsw(struct gk20a *g)
{
	gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
	return gr_gk20a_ctrl_ctxsw(g,
			gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
}
int gr_gk20a_halt_pipe(struct gk20a *g)
{
	return gr_gk20a_submit_fecs_method_op(g,
	      (struct fecs_method_op_gk20a) {
	      .method.addr =
			gr_fecs_method_push_adr_halt_pipeline_v(),
	      .method.data = ~0,
	      .mailbox = { .id   = 1, /*sideband?*/
			   .data = ~0, .clr = ~0, .ret = NULL,
			   .ok   = gr_fecs_ctxsw_mailbox_value_pass_v(),
			   .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
	      .cond.ok = GR_IS_UCODE_OP_EQUAL,
	      .cond.fail = GR_IS_UCODE_OP_EQUAL }, false);
}
static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
{
	u32 addr_lo;
	u32 addr_hi;
	void *inst_ptr = NULL;

	gk20a_dbg_fn("");

	inst_ptr = c->inst_block.cpu_va;
	if (!inst_ptr)
		return -ENOMEM;

	addr_lo = u64_lo32(gpu_va) >> 12;
	addr_hi = u64_hi32(gpu_va);

	gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
		 ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
		 ram_in_gr_wfi_ptr_lo_f(addr_lo));

	gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
		 ram_in_gr_wfi_ptr_hi_f(addr_hi));

	return 0;
}
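/*
 * Illustration (hypothetical numbers): for gpu_va = 0x123456000ULL the code
 * above programs ptr_lo with u64_lo32(gpu_va) >> 12 = 0x23456 and ptr_hi with
 * 0x1.  The low 12 bits are dropped, so gpu_va is assumed to be 4KB aligned.
 */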
/*
 * Context state can be written directly, or "patched" at times.  So that
 * code can be used in either situation it is written using a series of
 * _ctx_patch_write(..., patch) statements.  However, any necessary cpu
 * map/unmap and gpu l2 invalidates should be minimized (to avoid doing them
 * once per patch write).  Before a sequence of these, set up with
 * "_ctx_patch_write_begin" and close with "_ctx_patch_write_end."
 */
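/*
 * Typical call sequence (illustrative sketch, error handling trimmed; addr
 * and data are placeholders):
 *
 *	err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
 *	if (err)
 *		return err;
 *	gr_gk20a_ctx_patch_write(g, ch_ctx, addr, data, true);
 *	...
 *	gr_gk20a_ctx_patch_write_end(g, ch_ctx);
 *
 * With patch == false the write goes straight to the register instead;
 * gr_gk20a_commit_global_cb_manager() below follows this pattern.
 */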
int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
				   struct channel_ctx_gk20a *ch_ctx)
{
	/* being defensive still... */
	if (WARN_ON(ch_ctx->patch_ctx.mem.cpu_va)) {
		gk20a_err(dev_from_gk20a(g), "nested ctx patch begin?");
		return -EBUSY;
	}

	ch_ctx->patch_ctx.mem.cpu_va = vmap(ch_ctx->patch_ctx.mem.pages,
			PAGE_ALIGN(ch_ctx->patch_ctx.mem.size) >> PAGE_SHIFT,
			0, pgprot_dmacoherent(PAGE_KERNEL));

	if (!ch_ctx->patch_ctx.mem.cpu_va)
		return -ENOMEM;

	return 0;
}

int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
				 struct channel_ctx_gk20a *ch_ctx)
{
	/* being defensive still... */
	if (!ch_ctx->patch_ctx.mem.cpu_va) {
		gk20a_err(dev_from_gk20a(g), "dangling ctx patch end?");
		return -EINVAL;
	}

	vunmap(ch_ctx->patch_ctx.mem.cpu_va);
	ch_ctx->patch_ctx.mem.cpu_va = NULL;
	return 0;
}
int gr_gk20a_ctx_patch_write(struct gk20a *g,
			     struct channel_ctx_gk20a *ch_ctx,
			     u32 addr, u32 data, bool patch)
{
	u32 patch_slot = 0;
	void *patch_ptr = NULL;
	bool mapped_here = false;

	BUG_ON(patch != 0 && ch_ctx == NULL);

	if (patch) {
		/* we added an optimization prolog, epilog
		 * to get rid of unnecessary maps and l2 invals.
		 * but be defensive still... */
		if (!ch_ctx->patch_ctx.mem.cpu_va) {
			int err;
			gk20a_dbg_info("per-write ctx patch begin?");
			/* yes, gr_gk20a_ctx_patch_smpc causes this one */
			err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
			if (err)
				return err;
			mapped_here = true;
		} else
			mapped_here = false;

		patch_ptr = ch_ctx->patch_ctx.mem.cpu_va;
		patch_slot = ch_ctx->patch_ctx.data_count * 2;

		/* Each patch entry is an (addr, data) pair of words. */
		gk20a_mem_wr32(patch_ptr, patch_slot++, addr);
		gk20a_mem_wr32(patch_ptr, patch_slot++, data);

		ch_ctx->patch_ctx.data_count++;

		if (mapped_here)
			gr_gk20a_ctx_patch_write_end(g, ch_ctx);
	} else
		gk20a_writel(g, addr, data);

	return 0;
}
static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
					  struct channel_gk20a *c)
{
	u32 inst_base_ptr = u64_lo32(gk20a_mem_phys(&c->inst_block)
				     >> ram_in_base_shift_v());
	int ret;

	gk20a_dbg_info("bind channel %d inst ptr 0x%08x",
		   c->hw_chid, inst_base_ptr);

	ret = gr_gk20a_submit_fecs_method_op(g,
		     (struct fecs_method_op_gk20a) {
		     .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
		     .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
				     gr_fecs_current_ctx_target_vid_mem_f() |
				     gr_fecs_current_ctx_valid_f(1)),
		     .mailbox = { .id = 0, .data = 0,
				  .clr = 0x30,
				  .ret = NULL,
				  .ok = 0x10,
				  .fail = 0x20, },
		     .cond.ok = GR_IS_UCODE_OP_AND,
		     .cond.fail = GR_IS_UCODE_OP_AND}, true);
	if (ret)
		gk20a_err(dev_from_gk20a(g),
			"bind channel instance failed");

	return ret;
}
static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
				    bool disable_fifo)
{
	struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
	struct fifo_gk20a *f = &g->fifo;
	struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
	u32 va_lo, va_hi, va;
	int ret = 0;
	void *ctx_ptr = NULL;

	gk20a_dbg_fn("");

	ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
			PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
			0, pgprot_dmacoherent(PAGE_KERNEL));
	if (!ctx_ptr)
		return -ENOMEM;

	if (ch_ctx->zcull_ctx.gpu_va == 0 &&
	    ch_ctx->zcull_ctx.ctx_sw_mode ==
		ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
		ret = -EINVAL;
		goto clean_up;
	}

	/* Pack the 40-bit, 256B-aligned zcull buffer address into one word:
	 * bits 31:8 of the low word fill the low 24 bits, bits 7:0 of the
	 * high word fill the top 8 bits. */
	va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
	va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
	va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);

	if (disable_fifo) {
		ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
		if (ret) {
			gk20a_err(dev_from_gk20a(g),
				"failed to disable gr engine activity\n");
			goto clean_up;
		}
	}

	g->ops.mm.fb_flush(g);

	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
		 ch_ctx->zcull_ctx.ctx_sw_mode);

	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);

	if (disable_fifo) {
		ret = gk20a_fifo_enable_engine_activity(g, gr_info);
		if (ret) {
			gk20a_err(dev_from_gk20a(g),
				"failed to enable gr engine activity\n");
			goto clean_up;
		}
	}

clean_up:
	vunmap(ctx_ptr);

	return ret;
}
static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
			struct channel_gk20a *c, bool patch)
{
	struct gr_gk20a *gr = &g->gr;
	struct channel_ctx_gk20a *ch_ctx = NULL;
	u32 attrib_offset_in_chunk = 0;
	u32 alpha_offset_in_chunk = 0;
	u32 pd_ab_max_output;
	u32 gpc_index, ppc_index;
	u32 temp;
	u32 cbm_cfg_size1, cbm_cfg_size2;

	gk20a_dbg_fn("");

	if (patch) {
		int err;
		ch_ctx = &c->ch_ctx;
		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
		if (err)
			return err;
	}

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
		gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
		gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
		patch);

	pd_ab_max_output = (gr->alpha_cb_default_size *
		gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
		gr_pd_ab_dist_cfg1_max_output_granularity_v();

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
		gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
		gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);

	alpha_offset_in_chunk = attrib_offset_in_chunk +
		gr->tpc_count * gr->attrib_cb_size;

	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
		temp = proj_gpc_stride_v() * gpc_index;
		for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
		     ppc_index++) {
			cbm_cfg_size1 = gr->attrib_cb_default_size *
				gr->pes_tpc_count[ppc_index][gpc_index];
			cbm_cfg_size2 = gr->alpha_cb_default_size *
				gr->pes_tpc_count[ppc_index][gpc_index];

			gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpc0_ppc0_cbm_cfg_r() + temp +
				proj_ppc_in_gpc_stride_v() * ppc_index,
				gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
				gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
				gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);

			attrib_offset_in_chunk += gr->attrib_cb_size *
				gr->pes_tpc_count[ppc_index][gpc_index];

			gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpc0_ppc0_cbm_cfg2_r() + temp +
				proj_ppc_in_gpc_stride_v() * ppc_index,
				gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
				gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);

			alpha_offset_in_chunk += gr->alpha_cb_size *
				gr->pes_tpc_count[ppc_index][gpc_index];
		}
	}

	if (patch)
		gr_gk20a_ctx_patch_write_end(g, ch_ctx);

	return 0;
}
static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
			struct channel_gk20a *c, bool patch)
{
	struct gr_gk20a *gr = &g->gr;
	struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
	u64 addr;
	u32 size;

	gk20a_dbg_fn("");

	if (patch) {
		int err;
		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
		if (err)
			return err;
	}

	/* global pagepool buffer */
	addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
		gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
	       (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
		(32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));

	size = gr->global_ctx_buffer[PAGEPOOL].mem.size /
		gr_scc_pagepool_total_pages_byte_granularity_v();

	if (size == g->ops.gr.pagepool_default_size(g))
		size = gr_scc_pagepool_total_pages_hwmax_v();

	gk20a_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
		addr, size);

	g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, patch);

	/* global bundle cb */
	addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
		gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
	       (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
		(32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));

	size = gr->bundle_cb_default_size;

	gk20a_dbg_info("bundle cb addr : 0x%016llx, size : %d",
		addr, size);

	g->ops.gr.commit_global_bundle_cb(g, ch_ctx, addr, size, patch);

	/* global attrib cb */
	addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
		gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
	       (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
		(32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));

	gk20a_dbg_info("attrib cb addr : 0x%016llx", addr);
	g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, patch);

	if (patch)
		gr_gk20a_ctx_patch_write_end(g, ch_ctx);

	return 0;
}
static void gr_gk20a_commit_global_attrib_cb(struct gk20a *g,
					struct channel_ctx_gk20a *ch_ctx,
					u64 addr, bool patch)
{
	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
		gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
		gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
		gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
		gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
}
static void gr_gk20a_commit_global_bundle_cb(struct gk20a *g,
					struct channel_ctx_gk20a *ch_ctx,
					u64 addr, u64 size, bool patch)
{
	u32 data;

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
		gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
		gr_scc_bundle_cb_size_div_256b_f(size) |
		gr_scc_bundle_cb_size_valid_true_f(), patch);

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
		gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
		gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
		gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);

	/* data for state_limit */
	data = (g->gr.bundle_cb_default_size *
		gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
		gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();

	data = min_t(u32, data, g->gr.min_gpm_fifo_depth);

	gk20a_dbg_info("bundle cb token limit : %d, state limit : %d",
		   g->gr.bundle_cb_token_limit, data);

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
		gr_pd_ab_dist_cfg2_token_limit_f(g->gr.bundle_cb_token_limit) |
		gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
}
static int gr_gk20a_commit_global_timeslice(struct gk20a *g,
					struct channel_gk20a *c, bool patch)
{
	struct gr_gk20a *gr = &g->gr;
	struct channel_ctx_gk20a *ch_ctx = NULL;
	u32 gpm_pd_cfg;
	u32 pd_ab_dist_cfg0;
	u32 ds_debug;
	u32 mpc_vtg_debug;
	u32 pe_vaf;
	u32 pe_vsc_vpc;

	gk20a_dbg_fn("");

	gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
	pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
	ds_debug = gk20a_readl(g, gr_ds_debug_r());
	mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());

	if (patch) {
		int err;
		ch_ctx = &c->ch_ctx;
		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
		if (err)
			return err;
	}

	if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
		pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
		pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());

		gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
		pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
		pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
		pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
		ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
		mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;

		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
	} else {
		gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
		pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
		ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
		mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;

		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
	}

	if (patch)
		gr_gk20a_ctx_patch_write_end(g, ch_ctx);

	return 0;
}
int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
{
	u32 norm_entries, norm_shift;
	u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
	u32 map0, map1, map2, map3, map4, map5;

	if (!gr->map_tiles)
		return -1;

	gk20a_dbg_fn("");

	gk20a_writel(g, gr_crstr_map_table_cfg_r(),
		     gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
		     gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));

	map0 =  gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
		gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
		gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
		gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
		gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
		gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);

	map1 =  gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
		gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
		gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
		gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
		gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
		gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);

	map2 =  gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
		gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
		gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
		gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
		gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
		gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);

	map3 =  gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
		gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
		gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
		gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
		gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
		gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);

	map4 =  gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
		gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
		gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
		gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
		gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
		gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);

	map5 =  gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
		gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
		gr_crstr_gpc_map5_tile32_f(0) |
		gr_crstr_gpc_map5_tile33_f(0) |
		gr_crstr_gpc_map5_tile34_f(0) |
		gr_crstr_gpc_map5_tile35_f(0);

	gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
	gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
	gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
	gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
	gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
	gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);

	switch (gr->tpc_count) {
	case 1:
		norm_shift = 4;
		break;
	case 2:
	case 3:
		norm_shift = 3;
		break;
	case 4:
	case 5:
	case 6:
	case 7:
		norm_shift = 2;
		break;
	case 8:
	case 9:
	case 10:
	case 11:
	case 12:
	case 13:
	case 14:
	case 15:
		norm_shift = 1;
		break;
	default:
		norm_shift = 0;
		break;
	}

	norm_entries = gr->tpc_count << norm_shift;
	coeff5_mod = (1 << 5) % norm_entries;
	coeff6_mod = (1 << 6) % norm_entries;
	coeff7_mod = (1 << 7) % norm_entries;
	coeff8_mod = (1 << 8) % norm_entries;
	coeff9_mod = (1 << 9) % norm_entries;
	coeff10_mod = (1 << 10) % norm_entries;
	coeff11_mod = (1 << 11) % norm_entries;

	gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
		     gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
		     gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
		     gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
		     gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
		     gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));

	gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
		     gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
		     gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
		     gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
		     gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
		     gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
		     gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));

	gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
	gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
	gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
	gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
	gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
	gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);

	gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
		     gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
		     gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));

	gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
	gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
	gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
	gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
	gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
	gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);

	return 0;
}
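/*
 * Worked example (illustrative): with tpc_count = 2 the switch above picks
 * norm_shift = 3, so norm_entries = 2 << 3 = 16 and coeff5_mod =
 * (1 << 5) % 16 = 0, coeff7_mod = (1 << 7) % 16 = 0, and so on.  The shift
 * normalizes small TPC counts into the 16..31 range before the mod
 * coefficients are computed.
 */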
static inline u32 count_bits(u32 mask)
{
	u32 temp = mask;
	u32 count;

	for (count = 0; temp != 0; count++)
		temp &= temp - 1;

	return count;
}
static inline u32 clear_count_bits(u32 num, u32 clear_count)
{
	u32 count = clear_count;

	for (; (num != 0) && (count != 0); count--)
		num &= num - 1;

	return num;
}
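/*
 * Example (illustrative): count_bits(0xb) walks 1011 -> 1010 -> 1000 -> 0 and
 * returns 3; clear_count_bits(0xb, 2) clears the two lowest set bits the same
 * way and returns 0x8.  Both rely on the n & (n - 1) trick, which clears the
 * lowest set bit per iteration.
 */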
static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
					struct gr_gk20a *gr)
{
	u32 table_index_bits = 5;
	u32 rows = (1 << table_index_bits);
	u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;

	u32 row;
	u32 index;
	u32 gpc_index;
	u32 gpcs_per_reg = 4;
	u32 pes_index;
	u32 tpc_count_pes;
	u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();

	u32 alpha_target, beta_target;
	u32 alpha_bits, beta_bits;
	u32 alpha_mask, beta_mask, partial_mask;
	u32 reg_offset;
	bool assign_alpha;

	u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
	u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
	u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];

	gk20a_dbg_fn("");

	memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
	memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
	memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));

	for (row = 0; row < rows; ++row) {
		alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
		beta_target = gr->tpc_count - alpha_target;

		assign_alpha = (alpha_target < beta_target);

		for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
			reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
			alpha_mask = beta_mask = 0;

			for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
				tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];

				if (assign_alpha) {
					alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
					beta_bits = tpc_count_pes - alpha_bits;
				} else {
					beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
					alpha_bits = tpc_count_pes - beta_bits;
				}

				partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
				partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
				alpha_mask |= partial_mask;

				partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
				beta_mask |= partial_mask;

				alpha_target -= min(alpha_bits, alpha_target);
				beta_target -= min(beta_bits, beta_target);

				if ((alpha_bits > 0) || (beta_bits > 0))
					assign_alpha = !assign_alpha;
			}

			switch (gpc_index % gpcs_per_reg) {
			case 0:
				map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
				map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
				break;
			case 1:
				map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
				map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
				break;
			case 2:
				map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
				map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
				break;
			case 3:
				map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
				map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
				break;
			}
			map_reg_used[reg_offset] = true;
		}
	}
	for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
		if (map_reg_used[index]) {
			gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
			gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
		}
	}

	return 0;
}
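/*
 * Worked example (illustrative): for tpc_count = 8, row 16 of the 32-row
 * table gives alpha_target = max(8 * 16 / 32, 1) = 4 and beta_target = 4,
 * i.e. the middle row splits the TPCs evenly between the alpha and beta
 * partitions, while row 0 pins alpha_target at its minimum of 1.
 */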
static u32 gr_gk20a_get_gpc_tpc_mask(struct gk20a *g, u32 gpc_index)
{
	/* One TPC for gk20a */
	return 0x1;
}
static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;
	u32 tpc_index, gpc_index;
	u32 tpc_offset, gpc_offset;
	u32 sm_id = 0, gpc_id = 0;
	u32 tpc_per_gpc;
	u32 max_ways_evict = INVALID_MAX_WAYS;
	u32 l1c_dbg_reg_val;

	gk20a_dbg_fn("");

	for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
		for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
			gpc_offset = proj_gpc_stride_v() * gpc_index;
			if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
				tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;

				gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
					     gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
				gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
					     gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
				gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
					     gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
				gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
					     gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));

				g->gr.sm_to_cluster[sm_id].tpc_index = tpc_index;
				g->gr.sm_to_cluster[sm_id].gpc_index = gpc_index;

				sm_id++;
			}

			gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
				     gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
			gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
				     gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
		}
	}

	gr->no_of_sm = sm_id;

	for (tpc_index = 0, gpc_id = 0;
	     tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
	     tpc_index++, gpc_id += 8) {

		if (gpc_id >= gr->gpc_count)
			gpc_id = 0;

		tpc_per_gpc =
			gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
			gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
			gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
			gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
			gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
			gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
			gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
			gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);

		gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
		gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
	}

	/* gr__setup_pd_mapping stubbed for gk20a */
	gr_gk20a_setup_rop_mapping(g, gr);
	if (g->ops.gr.setup_alpha_beta_tables)
		g->ops.gr.setup_alpha_beta_tables(g, gr);

	if (gr->num_fbps == 1)
		max_ways_evict = 9;

	if (max_ways_evict != INVALID_MAX_WAYS)
		g->ops.ltc.set_max_ways_evict_last(g, max_ways_evict);

	for (gpc_index = 0;
	     gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
	     gpc_index += 4) {
		/* mask fields must be combined with bitwise OR */
		gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
			     gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
			     gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
			     gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
			     gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
	}

	gk20a_writel(g, gr_cwd_fs_r(),
		     gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
		     gr_cwd_fs_num_tpcs_f(gr->tpc_count));

	gk20a_writel(g, gr_bes_zrop_settings_r(),
		     gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
	gk20a_writel(g, gr_bes_crop_settings_r(),
		     gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));

	/* turn on cya15 bit for a default val that missed the cut */
	l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
	l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
	gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);

	return 0;
}
static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
{
	struct gk20a *g = c->g;
	int ret;
	u32 inst_base_ptr =
		u64_lo32(gk20a_mem_phys(&c->inst_block)
		>> ram_in_base_shift_v());

	gk20a_dbg_fn("");

	ret = gr_gk20a_submit_fecs_method_op(g,
		(struct fecs_method_op_gk20a) {
		.method.addr = save_type,
		.method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
				gr_fecs_current_ctx_target_vid_mem_f() |
				gr_fecs_current_ctx_valid_f(1)),
		.mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
			.ok = 1, .fail = 2,
		},
		.cond.ok = GR_IS_UCODE_OP_AND,
		.cond.fail = GR_IS_UCODE_OP_AND,
		 }, true);

	if (ret)
		gk20a_err(dev_from_gk20a(g), "save context image failed");

	return ret;
}
static u32 gk20a_init_sw_bundle(struct gk20a *g)
{
	struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
	u32 last_bundle_data = 0;
	u32 err = 0;
	int i;
	unsigned long end_jiffies = jiffies +
		msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
	u32 fe_go_idle_timeout_save;

	/* save and disable fe_go_idle */
	fe_go_idle_timeout_save =
		gk20a_readl(g, gr_fe_go_idle_timeout_r());
	gk20a_writel(g, gr_fe_go_idle_timeout_r(),
		(fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
		gr_fe_go_idle_timeout_count_disabled_f());
	/* enable pipe mode override */
	gk20a_writel(g, gr_pipe_bundle_config_r(),
		gr_pipe_bundle_config_override_pipe_mode_enabled_f());

	/* load bundle init */
	for (i = 0; i < sw_bundle_init->count; i++) {
		if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
			gk20a_writel(g, gr_pipe_bundle_data_r(),
				sw_bundle_init->l[i].value);
			last_bundle_data = sw_bundle_init->l[i].value;
		}

		gk20a_writel(g, gr_pipe_bundle_address_r(),
			     sw_bundle_init->l[i].addr);

		if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
		    GR_GO_IDLE_BUNDLE)
			err |= gr_gk20a_wait_idle(g, end_jiffies,
					GR_IDLE_CHECK_DEFAULT);
		else if (tegra_platform_is_silicon())
			err = gr_gk20a_wait_fe_idle(g, end_jiffies,
					GR_IDLE_CHECK_DEFAULT);
	}

	/* disable pipe mode override */
	gk20a_writel(g, gr_pipe_bundle_config_r(),
		gr_pipe_bundle_config_override_pipe_mode_disabled_f());

	/* restore fe_go_idle */
	gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);

	return err;
}
/* init global golden image from a fresh gr_ctx in channel ctx.
   save a copy in local_golden_image in ctx_vars */
static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
					  struct channel_gk20a *c)
{
	struct gr_gk20a *gr = &g->gr;
	struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
	u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
	u32 ctx_header_words;
	u32 i;
	u32 data;
	void *ctx_ptr = NULL;
	void *gold_ptr = NULL;
	int err = 0;

	gk20a_dbg_fn("");

	/* golden ctx is global to all channels. Although only the first
	   channel initializes golden image, driver needs to prevent multiple
	   channels from initializing golden ctx at the same time */
	mutex_lock(&gr->ctx_mutex);

	if (gr->ctx_vars.golden_image_initialized)
		goto clean_up;

	err = gr_gk20a_fecs_ctx_bind_channel(g, c);
	if (err)
		goto clean_up;

	err = gk20a_init_sw_bundle(g);
	if (err)
		goto clean_up;

	err = gr_gk20a_elpg_protected_call(g,
			gr_gk20a_commit_global_ctx_buffers(g, c, false));
	if (err)
		goto clean_up;

	gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].mem.pages,
			PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].mem.size) >>
			PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
	if (!gold_ptr)
		goto clean_up;

	ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
			PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
			0, pgprot_dmacoherent(PAGE_KERNEL));
	if (!ctx_ptr)
		goto clean_up;

	ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
	ctx_header_words >>= 2;

	g->ops.mm.l2_flush(g, true);

	for (i = 0; i < ctx_header_words; i++) {
		data = gk20a_mem_rd32(ctx_ptr, i);
		gk20a_mem_wr32(gold_ptr, i, data);
	}

	gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
		       ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());

	gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);

	gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);

	gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());

	if (gr->ctx_vars.local_golden_image == NULL) {

		gr->ctx_vars.local_golden_image =
			kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);

		if (gr->ctx_vars.local_golden_image == NULL) {
			err = -ENOMEM;
			goto clean_up;
		}

		for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
			gr->ctx_vars.local_golden_image[i] =
				gk20a_mem_rd32(gold_ptr, i);
	}

	gr_gk20a_commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va);

	gr->ctx_vars.golden_image_initialized = true;

	gk20a_writel(g, gr_fecs_current_ctx_r(),
		gr_fecs_current_ctx_valid_false_f());

clean_up:
	if (err)
		gk20a_err(dev_from_gk20a(g), "fail");
	else
		gk20a_dbg_fn("done");

	if (gold_ptr)
		vunmap(gold_ptr);
	if (ctx_ptr)
		vunmap(ctx_ptr);

	mutex_unlock(&gr->ctx_mutex);
	return err;
}
int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
				    struct channel_gk20a *c,
				    bool enable_smpc_ctxsw)
{
	struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
	void *ctx_ptr = NULL;
	u32 data;
	int ret;

	c->g->ops.fifo.disable_channel(c);
	ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid);
	if (ret) {
		gk20a_err(dev_from_gk20a(g),
			"failed to preempt channel\n");
		return ret;
	}

	/* Channel gr_ctx buffer is gpu cacheable.
	   Flush and invalidate before cpu update. */
	g->ops.mm.l2_flush(g, true);

	ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
			PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
			0, pgprot_dmacoherent(PAGE_KERNEL));
	if (!ctx_ptr)
		return -ENOMEM;

	data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
	data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
	data |= enable_smpc_ctxsw ?
		ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
		ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
		 data);

	vunmap(ctx_ptr);

	/* enable channel */
	gk20a_writel(c->g, ccsr_channel_r(c->hw_chid),
		gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) |
		ccsr_channel_enable_set_true_f());

	return 0;
}
/* load saved fresh copy of golden image into channel gr_ctx */
int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
				   struct channel_gk20a *c)
{
	struct gr_gk20a *gr = &g->gr;
	struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
	u32 virt_addr_lo;
	u32 virt_addr_hi;
	u32 i, v, data;
	int ret = 0;
	void *ctx_ptr = NULL;

	gk20a_dbg_fn("");

	if (gr->ctx_vars.local_golden_image == NULL)
		return -1;

	/* Channel gr_ctx buffer is gpu cacheable.
	   Flush and invalidate before cpu update. */
	g->ops.mm.l2_flush(g, true);

	ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
			PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
			0, pgprot_dmacoherent(PAGE_KERNEL));
	if (!ctx_ptr)
		return -ENOMEM;

	for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
		gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);

	if (g->ops.gr.enable_cde_in_fecs && c->cde)
		g->ops.gr.enable_cde_in_fecs(ctx_ptr);

	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);

	virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va);
	virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va);

	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
		 ch_ctx->patch_ctx.data_count);
	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
		 virt_addr_lo);
	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
		 virt_addr_hi);

	/* no user for client managed performance counter ctx */
	data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
	data = data & ~ctxsw_prog_main_image_pm_mode_m();
	data |= ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
		 data);

	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);

	/* set priv access map */
	virt_addr_lo =
		 u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
	virt_addr_hi =
		 u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);

	if (g->allow_all)
		data = ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f();
	else
		data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f();

	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0,
		 data);
	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0,
		 virt_addr_lo);
	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0,
		 virt_addr_hi);
	/* disable verif features */
	v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0);
	v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
	v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v);

	if (g->ops.gr.update_ctxsw_preemption_mode)
		g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, ctx_ptr);

	vunmap(ctx_ptr);

	if (tegra_platform_is_linsim()) {
		u32 inst_base_ptr =
			u64_lo32(gk20a_mem_phys(&c->inst_block)
			>> ram_in_base_shift_v());

		ret = gr_gk20a_submit_fecs_method_op(g,
			  (struct fecs_method_op_gk20a) {
				  .method.data =
					  (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
					   gr_fecs_current_ctx_target_vid_mem_f() |
					   gr_fecs_current_ctx_valid_f(1)),
				  .method.addr =
					  gr_fecs_method_push_adr_restore_golden_v(),
				  .mailbox = {
					  .id = 4, .data = 0,
					  .clr = ~0, .ret = NULL,
					  .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
					  .fail = 0},
				  .cond.ok = GR_IS_UCODE_OP_EQUAL,
				  .cond.fail = GR_IS_UCODE_OP_SKIP}, false);

		if (ret)
			gk20a_err(dev_from_gk20a(g),
				   "restore context image failed");
	}

	return ret;
}
static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
{
	gk20a_dbg_fn("");

	gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
		gr_fecs_ctxsw_mailbox_clear_value_f(~0));

	gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
	gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));

	gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
	gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));

	gk20a_dbg_fn("done");
}
static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;
	struct vm_gk20a *vm = &mm->pmu.vm;
	struct device *d = dev_from_gk20a(g);
	struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
	int err;

	err = gk20a_alloc_inst_block(g, &ucode_info->inst_blk_desc);
	if (err)
		return err;

	gk20a_init_inst_block(&ucode_info->inst_blk_desc, vm, 0);

	/* Map ucode surface to GMMU */
	ucode_info->surface_desc.gpu_va = gk20a_gmmu_map(vm,
					&ucode_info->surface_desc.sgt,
					ucode_info->surface_desc.size,
					0, /* flags */
					gk20a_mem_flag_read_only,
					false);
	if (!ucode_info->surface_desc.gpu_va) {
		gk20a_err(d, "failed to update gmmu ptes\n");
		return -ENOMEM;
	}

	return 0;
}
static void gr_gk20a_init_ctxsw_ucode_segment(
	struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
{
	p_seg->offset = *offset;
	p_seg->size = size;
	*offset = ALIGN(*offset + size, BLK_SIZE);
}
static void gr_gk20a_init_ctxsw_ucode_segments(
	struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
	struct gk20a_ctxsw_bootloader_desc *bootdesc,
	u32 code_size, u32 data_size)
{
	u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
	segments->boot_entry = bootdesc->entry_point;
	segments->boot_imem_offset = bootdesc->imem_offset;
	gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
	gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
	gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
}
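/*
 * Resulting surface layout (informal), each segment starting at a BLK_SIZE
 * (256-byte) aligned offset, laid out once per falcon (FECS, then GPCCS):
 *
 *	+--------+--------+--------+
 *	|  boot  |  code  |  data  |
 *	+--------+--------+--------+
 */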
static int gr_gk20a_copy_ctxsw_ucode_segments(
	u8 *buf,
	struct gk20a_ctxsw_ucode_segments *segments,
	u32 *bootimage,
	u32 *code, u32 *data)
{
	int i;

	memcpy(buf + segments->boot.offset, bootimage, segments->boot.size);
	memcpy(buf + segments->code.offset, code, segments->code.size);
	memcpy(buf + segments->data.offset, data, segments->data.size);

	/* compute a "checksum" for the boot binary to detect its version */
	segments->boot_signature = 0;
	for (i = 0; i < segments->boot.size / sizeof(u32); i++)
		segments->boot_signature += bootimage[i];

	return 0;
}
int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
{
	struct device *d = dev_from_gk20a(g);
	struct mm_gk20a *mm = &g->mm;
	struct vm_gk20a *vm = &mm->pmu.vm;
	struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
	struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
	const struct firmware *fecs_fw;
	const struct firmware *gpccs_fw;
	u32 *fecs_boot_image;
	u32 *gpccs_boot_image;
	struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
	u8 *buf;
	u32 ucode_size;
	int err = 0;

	fecs_fw = gk20a_request_firmware(g, GK20A_FECS_UCODE_IMAGE);
	if (!fecs_fw) {
		gk20a_err(d, "failed to load fecs ucode!!");
		return -ENOENT;
	}

	fecs_boot_desc = (void *)fecs_fw->data;
	fecs_boot_image = (void *)(fecs_fw->data +
				sizeof(struct gk20a_ctxsw_bootloader_desc));

	gpccs_fw = gk20a_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE);
	if (!gpccs_fw) {
		release_firmware(fecs_fw);
		gk20a_err(d, "failed to load gpccs ucode!!");
		return -ENOENT;
	}

	gpccs_boot_desc = (void *)gpccs_fw->data;
	gpccs_boot_image = (void *)(gpccs_fw->data +
				sizeof(struct gk20a_ctxsw_bootloader_desc));

	ucode_size = 0;
	gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
		fecs_boot_desc,
		g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
		g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
	gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
		gpccs_boot_desc,
		g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
		g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));

	err = gk20a_gmmu_alloc_attr(g, DMA_ATTR_READ_ONLY, ucode_size,
			&ucode_info->surface_desc);
	if (err)
		goto clean_up;

	buf = (u8 *)ucode_info->surface_desc.cpu_va;
	if (!buf) {
		gk20a_err(d, "failed to map surface desc buffer");
		err = -ENOMEM;
		goto clean_up;
	}

	gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs,
		fecs_boot_image,
		g->gr.ctx_vars.ucode.fecs.inst.l,
		g->gr.ctx_vars.ucode.fecs.data.l);

	release_firmware(fecs_fw);
	fecs_fw = NULL;

	gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs,
		gpccs_boot_image,
		g->gr.ctx_vars.ucode.gpccs.inst.l,
		g->gr.ctx_vars.ucode.gpccs.data.l);

	release_firmware(gpccs_fw);
	gpccs_fw = NULL;

	err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
	if (err)
		goto clean_up;

	return 0;

clean_up:
	if (ucode_info->surface_desc.gpu_va)
		gk20a_gmmu_unmap(vm, ucode_info->surface_desc.gpu_va,
			ucode_info->surface_desc.size, gk20a_mem_flag_none);
	gk20a_gmmu_free(g, &ucode_info->surface_desc);

	release_firmware(gpccs_fw);
	gpccs_fw = NULL;
	release_firmware(fecs_fw);
	fecs_fw = NULL;

	return err;
}
void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
{
	struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
	int retries = 20;
	phys_addr_t inst_ptr;
	u32 val;

	while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
			gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
		udelay(2);
		retries--;
	}
	if (!retries)
		gk20a_err(dev_from_gk20a(g),
			  "arbiter idle timeout, status: %08x",
			  gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));

	gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);

	inst_ptr = gk20a_mem_phys(&ucode_info->inst_blk_desc);
	gk20a_writel(g, gr_fecs_new_ctx_r(),
		     gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
		     gr_fecs_new_ctx_target_m() |
		     gr_fecs_new_ctx_valid_m());

	gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
		     gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
		     gr_fecs_arb_ctx_ptr_target_m());

	gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);

	/* Wait for arbiter command to complete */
	retries = 20;
	val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
	while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
		udelay(2);
		retries--;
		val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
	}
	if (!retries)
		gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");

	gk20a_writel(g, gr_fecs_current_ctx_r(),
			gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
			gr_fecs_current_ctx_target_m() |
			gr_fecs_current_ctx_valid_m());
	/* Send command to arbiter to flush */
	gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());

	retries = 20;
	val = (gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
	while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
		udelay(2);
		retries--;
		val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
	}
	if (!retries)
		gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
}
void gr_gk20a_load_ctxsw_ucode_header(struct gk20a *g, u64 addr_base,
	struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
{
	u32 addr_code32;
	u32 addr_data32;

	addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
	addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);

	/*
	 * Copy falcon bootloader header into dmem at offset 0.
	 * Configure dmem port 0 for auto-incrementing writes starting at dmem
	 * offset 0.
	 */
	gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
			gr_fecs_dmemc_offs_f(0) |
			gr_fecs_dmemc_blk_f(0) |
			gr_fecs_dmemc_aincw_f(1));

	/* Write out the actual data */
	switch (segments->boot_signature) {
	case FALCON_UCODE_SIG_T21X_FECS_WITH_DMEM_SIZE:
	case FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED:
	case FALCON_UCODE_SIG_T21X_GPCCS_WITH_RESERVED:
	case FALCON_UCODE_SIG_T12X_FECS_WITH_RESERVED:
	case FALCON_UCODE_SIG_T12X_GPCCS_WITH_RESERVED:
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		/* fall through */
	case FALCON_UCODE_SIG_T12X_FECS_WITHOUT_RESERVED:
	case FALCON_UCODE_SIG_T12X_GPCCS_WITHOUT_RESERVED:
	case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED:
	case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED2:
	case FALCON_UCODE_SIG_T21X_GPCCS_WITHOUT_RESERVED:
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 4);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
			addr_code32);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
			segments->code.size);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
			addr_data32);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
			segments->data.size);
		break;
	case FALCON_UCODE_SIG_T12X_FECS_OLDER:
	case FALCON_UCODE_SIG_T12X_GPCCS_OLDER:
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
			addr_code32);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
			segments->code.size);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
			addr_data32);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
			segments->data.size);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
			addr_code32);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
		break;
	default:
		gk20a_err(dev_from_gk20a(g),
				"unknown falcon ucode boot signature 0x%08x"
				" with reg_offset 0x%08x",
				segments->boot_signature, reg_offset);
		break;
	}
}
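
/*
 * The bootloader image itself is transferred into the falcon IMEM by
 * DMA in 256-byte blocks; the boot segment size is rounded up to the
 * next 256-byte boundary before computing the block count.
 */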
void gr_gk20a_load_ctxsw_ucode_boot(struct gk20a *g, u64 addr_base,
	struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
{
	u32 addr_load32;
	u32 blocks;
	u32 b;
	u32 dst;

	addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
	blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;

	/*
	 * Set the base FB address for the DMA transfer. Subtract off the 256
	 * byte IMEM block offset such that the relative FB and IMEM offsets
	 * match, allowing the IMEM tags to be properly created.
	 */

	dst = segments->boot_imem_offset;
	gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
			(addr_load32 - (dst >> 8)));

	for (b = 0; b < blocks; b++) {
		/* Setup destination IMEM offset */
		gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
				dst + (b << 8));

		/* Setup source offset (relative to BASE) */
		gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
				dst + (b << 8));

		gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
				gr_fecs_dmatrfcmd_imem_f(0x01) |
				gr_fecs_dmatrfcmd_write_f(0x00) |
				gr_fecs_dmatrfcmd_size_f(0x06) |
				gr_fecs_dmatrfcmd_ctxdma_f(0));
	}

	/* Specify the falcon boot vector */
	gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
			gr_fecs_bootvec_vec_f(segments->boot_entry));
}
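
/*
 * Bootstrap one falcon: drop the context requirement for DMA, write
 * the bootloader header into DMEM, DMA the bootloader into IMEM, then
 * start the falcon CPU. The same routine serves FECS and GPCCS;
 * reg_offset selects which falcon's registers are written.
 */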
static int gr_gk20a_load_ctxsw_ucode_segments(struct gk20a *g, u64 addr_base,
	struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
{
	gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
			gr_fecs_dmactl_require_ctx_f(0));

	/* Copy falcon bootloader into dmem */
	gr_gk20a_load_ctxsw_ucode_header(g, addr_base, segments, reg_offset);
	gr_gk20a_load_ctxsw_ucode_boot(g, addr_base, segments, reg_offset);

	/* Write to CPUCTL to start the falcon */
	gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
			gr_fecs_cpuctl_startcpu_f(0x01));

	return 0;
}
static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
{
	struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
	u64 addr_base = ucode_info->surface_desc.gpu_va;

	gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);

	gr_gk20a_load_falcon_bind_instblk(g);

	g->ops.gr.falcon_load_ucode(g, addr_base,
		&g->ctxsw_ucode_info.fecs, 0);

	g->ops.gr.falcon_load_ucode(g, addr_base,
		&g->ctxsw_ucode_info.gpccs,
		gr_gpcs_gpccs_falcon_hwcfg_r() -
		gr_fecs_falcon_hwcfg_r());
}
int gr_gk20a_load_ctxsw_ucode(struct gk20a *g)
{
	int err;

	gk20a_dbg_fn("");

	if (tegra_platform_is_linsim()) {
		gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
			gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
		gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
			gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
	}

	/*
	 * In case bootloader is not supported, revert to the old way of
	 * loading gr ucode, without the faster bootstrap routine.
	 */
	if (!g->ops.gr_ctx.use_dma_for_fw_bootstrap) {
		gr_gk20a_load_falcon_dmem(g);
		gr_gk20a_load_falcon_imem(g);
		gr_gk20a_start_falcon_ucode(g);
	} else {
		if (!g->gr.skip_ucode_init) {
			err = gr_gk20a_init_ctxsw_ucode(g);
			if (err)
				return err;
		}
		gr_gk20a_load_falcon_with_bootloader(g);
		g->gr.skip_ucode_init = true;
	}

	gk20a_dbg_fn("done");
	return 0;
}
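
/*
 * After the falcons are started, ctxsw readiness is signalled through
 * mailbox 0: wait for the init-complete handshake value, then clear
 * the mailbox and arm the FECS watchdog.
 */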
static int gr_gk20a_wait_ctxsw_ready(struct gk20a *g)
{
	int ret;

	gk20a_dbg_fn("");

	ret = gr_gk20a_ctx_wait_ucode(g, 0, NULL,
				      GR_IS_UCODE_OP_EQUAL,
				      eUcodeHandshakeInitComplete,
				      GR_IS_UCODE_OP_SKIP, 0, false);
	if (ret) {
		gk20a_err(dev_from_gk20a(g), "falcon ucode init timeout");
		return ret;
	}

	if (g->ops.gr_ctx.use_dma_for_fw_bootstrap || g->ops.securegpccs)
		gk20a_writel(g, gr_fecs_current_ctx_r(),
			gr_fecs_current_ctx_valid_false_f());

	gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
	gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
	gk20a_writel(g, gr_fecs_method_push_r(),
		     gr_fecs_method_push_adr_set_watchdog_timeout_f());

	gk20a_dbg_fn("done");
	return 0;
}
int gr_gk20a_init_ctx_state(struct gk20a *g)
{
	int ret;
	u32 pm_ctx_image_size;
	struct fecs_method_op_gk20a op = {
		.mailbox = { .id = 0, .data = 0,
			     .clr = ~0, .ok = 0, .fail = 0},
		.method.data = 0,
		.cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
		.cond.fail = GR_IS_UCODE_OP_SKIP,
		};

	gk20a_dbg_fn("");

	if (!g->gr.ctx_vars.golden_image_size) {
		op.method.addr =
			gr_fecs_method_push_adr_discover_image_size_v();
		op.mailbox.ret = &g->gr.ctx_vars.golden_image_size;
		ret = gr_gk20a_submit_fecs_method_op(g, op, false);
		if (ret) {
			gk20a_err(dev_from_gk20a(g),
				  "query golden image size failed");
			return ret;
		}
		op.method.addr =
			gr_fecs_method_push_adr_discover_zcull_image_size_v();
		op.mailbox.ret = &g->gr.ctx_vars.zcull_ctxsw_image_size;
		ret = gr_gk20a_submit_fecs_method_op(g, op, false);
		if (ret) {
			gk20a_err(dev_from_gk20a(g),
				  "query zcull ctx image size failed");
			return ret;
		}
		op.method.addr =
			gr_fecs_method_push_adr_discover_pm_image_size_v();
		op.mailbox.ret = &pm_ctx_image_size;
		ret = gr_gk20a_submit_fecs_method_op(g, op, false);
		if (ret) {
			gk20a_err(dev_from_gk20a(g),
				  "query pm ctx image size failed");
			return ret;
		}
		g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
	}

	gk20a_dbg_fn("done");
	return 0;
}
static void gk20a_gr_destroy_ctx_buffer(struct gk20a *g,
					struct gr_ctx_buffer_desc *desc)
{
	if (!desc)
		return;
	gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, &desc->mem);
	desc->destroy = NULL;
}

static int gk20a_gr_alloc_ctx_buffer(struct gk20a *g,
				     struct gr_ctx_buffer_desc *desc,
				     size_t size)
{
	int err = 0;

	err = gk20a_gmmu_alloc_attr(g, DMA_ATTR_NO_KERNEL_MAPPING,
				    size, &desc->mem);
	if (err)
		return err;

	desc->destroy = gk20a_gr_destroy_ctx_buffer;

	return err;
}
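
/*
 * Global context buffers are shared per-GPU allocations: the circular
 * (bundle) buffer, pagepool and attribute buffers (each with an
 * optional VPR-secured variant), the golden context image and the
 * priv register access map. They are allocated once here and later
 * mapped into each channel's address space.
 */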
static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
{
	struct gk20a_platform *platform = platform_get_drvdata(g->dev);
	struct gr_gk20a *gr = &g->gr;
	int i, attr_buffer_size, err;
	struct platform_device *pdev = g->dev;

	u32 cb_buffer_size = gr->bundle_cb_default_size *
		gr_scc_bundle_cb_size_div_256b_byte_granularity_v();

	u32 pagepool_buffer_size = g->ops.gr.pagepool_default_size(g) *
		gr_scc_pagepool_total_pages_byte_granularity_v();

	gk20a_dbg_fn("");

	attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);

	gk20a_dbg_info("cb_buffer_size : %d", cb_buffer_size);

	err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[CIRCULAR],
					cb_buffer_size);
	if (err)
		goto clean_up;

	if (platform->secure_alloc)
		platform->secure_alloc(pdev,
				       &gr->global_ctx_buffer[CIRCULAR_VPR],
				       cb_buffer_size);

	gk20a_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);

	err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[PAGEPOOL],
					pagepool_buffer_size);
	if (err)
		goto clean_up;

	if (platform->secure_alloc)
		platform->secure_alloc(pdev,
				       &gr->global_ctx_buffer[PAGEPOOL_VPR],
				       pagepool_buffer_size);

	gk20a_dbg_info("attr_buffer_size : %d", attr_buffer_size);

	err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[ATTRIBUTE],
					attr_buffer_size);
	if (err)
		goto clean_up;

	if (platform->secure_alloc)
		platform->secure_alloc(pdev,
				       &gr->global_ctx_buffer[ATTRIBUTE_VPR],
				       attr_buffer_size);

	if (platform->secure_buffer.destroy)
		platform->secure_buffer.destroy(pdev, &platform->secure_buffer);

	gk20a_dbg_info("golden_image_size : %d",
		       gr->ctx_vars.golden_image_size);

	err = gk20a_gr_alloc_ctx_buffer(g,
					&gr->global_ctx_buffer[GOLDEN_CTX],
					gr->ctx_vars.golden_image_size);
	if (err)
		goto clean_up;

	gk20a_dbg_info("priv_access_map_size : %d",
		       gr->ctx_vars.priv_access_map_size);

	err = gk20a_gr_alloc_ctx_buffer(g,
					&gr->global_ctx_buffer[PRIV_ACCESS_MAP],
					gr->ctx_vars.priv_access_map_size);
	if (err)
		goto clean_up;

	gk20a_dbg_fn("done");
	return 0;

clean_up:
	gk20a_err(dev_from_gk20a(g), "fail");
	for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
		if (gr->global_ctx_buffer[i].destroy) {
			gr->global_ctx_buffer[i].destroy(g,
					&gr->global_ctx_buffer[i]);
		}
	}
	return -ENOMEM;
}
static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;
	DEFINE_DMA_ATTRS(attrs);
	u32 i;

	dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);

	for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
		gr->global_ctx_buffer[i].destroy(g,
				&gr->global_ctx_buffer[i]);
	}

	gk20a_dbg_fn("done");
}
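
/*
 * Map each global context buffer into the channel's GPU address
 * space, recording the resulting GPU VA and size in the channel
 * context. For a VPR channel the secured variant of a buffer is
 * mapped instead, when it exists.
 */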
static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
					   struct channel_gk20a *c)
{
	struct vm_gk20a *ch_vm = c->vm;
	u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
	u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
	struct gr_gk20a *gr = &g->gr;
	struct sg_table *sgt;
	u64 size;
	u64 gpu_va;
	u32 i;

	gk20a_dbg_fn("");

	/* Circular Buffer */
	if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].mem.sgt == NULL)) {
		sgt = gr->global_ctx_buffer[CIRCULAR].mem.sgt;
		size = gr->global_ctx_buffer[CIRCULAR].mem.size;
	} else {
		sgt = gr->global_ctx_buffer[CIRCULAR_VPR].mem.sgt;
		size = gr->global_ctx_buffer[CIRCULAR_VPR].mem.size;
	}

	gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
				NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
				gk20a_mem_flag_none, true);
	if (!gpu_va)
		goto clean_up;
	g_bfr_va[CIRCULAR_VA] = gpu_va;
	g_bfr_size[CIRCULAR_VA] = size;

	/* Attribute Buffer */
	if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].mem.sgt == NULL)) {
		sgt = gr->global_ctx_buffer[ATTRIBUTE].mem.sgt;
		size = gr->global_ctx_buffer[ATTRIBUTE].mem.size;
	} else {
		sgt = gr->global_ctx_buffer[ATTRIBUTE_VPR].mem.sgt;
		size = gr->global_ctx_buffer[ATTRIBUTE_VPR].mem.size;
	}

	gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
				NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
				gk20a_mem_flag_none, false);
	if (!gpu_va)
		goto clean_up;
	g_bfr_va[ATTRIBUTE_VA] = gpu_va;
	g_bfr_size[ATTRIBUTE_VA] = size;

	/* Page Pool */
	if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].mem.sgt == NULL)) {
		sgt = gr->global_ctx_buffer[PAGEPOOL].mem.sgt;
		size = gr->global_ctx_buffer[PAGEPOOL].mem.size;
	} else {
		sgt = gr->global_ctx_buffer[PAGEPOOL_VPR].mem.sgt;
		size = gr->global_ctx_buffer[PAGEPOOL_VPR].mem.size;
	}

	gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
				NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
				gk20a_mem_flag_none, true);
	if (!gpu_va)
		goto clean_up;
	g_bfr_va[PAGEPOOL_VA] = gpu_va;
	g_bfr_size[PAGEPOOL_VA] = size;

	/* Golden Image */
	sgt = gr->global_ctx_buffer[GOLDEN_CTX].mem.sgt;
	size = gr->global_ctx_buffer[GOLDEN_CTX].mem.size;
	gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
				gk20a_mem_flag_none, true);
	if (!gpu_va)
		goto clean_up;
	g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
	g_bfr_size[GOLDEN_CTX_VA] = size;

	/* Priv register Access Map */
	sgt = gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.sgt;
	size = gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.size;
	gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
				gk20a_mem_flag_none, true);
	if (!gpu_va)
		goto clean_up;
	g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
	g_bfr_size[PRIV_ACCESS_MAP_VA] = size;

	c->ch_ctx.global_ctx_buffer_mapped = true;
	return 0;

clean_up:
	for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
		if (g_bfr_va[i]) {
			gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
					 gr->global_ctx_buffer[i].mem.size,
					 gk20a_mem_flag_none);
			g_bfr_va[i] = 0;
		}
	}
	return -ENOMEM;
}
static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
{
	struct vm_gk20a *ch_vm = c->vm;
	u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
	u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
	u32 i;

	gk20a_dbg_fn("");

	for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
		if (g_bfr_va[i]) {
			gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
					 g_bfr_size[i],
					 gk20a_mem_flag_none);
			g_bfr_va[i] = 0;
			g_bfr_size[i] = 0;
		}
	}
	c->ch_ctx.global_ctx_buffer_mapped = false;
}
int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
			  struct gr_ctx_desc **__gr_ctx, struct vm_gk20a *vm,
			  u32 class, u32 padding)
{
	struct gr_ctx_desc *gr_ctx = NULL;
	struct gr_gk20a *gr = &g->gr;
	int err = 0;

	gk20a_dbg_fn("");

	if (gr->ctx_vars.buffer_size == 0)
		return 0;

	/* alloc channel gr ctx buffer */
	gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
	gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;

	gr_ctx = kzalloc(sizeof(*gr_ctx), GFP_KERNEL);
	if (!gr_ctx)
		return -ENOMEM;

	err = gk20a_gmmu_alloc_attr(g, DMA_ATTR_NO_KERNEL_MAPPING,
				    gr->ctx_vars.buffer_total_size,
				    &gr_ctx->mem);
	if (err)
		goto err_free_ctx;

	gr_ctx->mem.gpu_va = gk20a_gmmu_map(vm, &gr_ctx->mem.sgt, gr_ctx->mem.size,
				NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
				gk20a_mem_flag_none, true);
	if (!gr_ctx->mem.gpu_va)
		goto err_free_mem;

	*__gr_ctx = gr_ctx;

	return 0;

err_free_mem:
	gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, &gr_ctx->mem);
err_free_ctx:
	kfree(gr_ctx);
	gr_ctx = NULL;

	return err;
}
static int gr_gk20a_alloc_tsg_gr_ctx(struct gk20a *g,
			struct tsg_gk20a *tsg, u32 class, u32 padding)
{
	struct gr_ctx_desc **gr_ctx = &tsg->tsg_gr_ctx;
	int err;

	if (!tsg->vm) {
		gk20a_err(dev_from_gk20a(tsg->g), "No address space bound\n");
		return -ENOMEM;
	}

	err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, tsg->vm, class, padding);
	if (err)
		return err;

	return 0;
}

static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
				struct channel_gk20a *c,
				u32 class,
				u32 padding)
{
	struct gr_ctx_desc **gr_ctx = &c->ch_ctx.gr_ctx;
	int err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, c->vm, class, padding);
	if (err)
		return err;

	return 0;
}

void gr_gk20a_free_gr_ctx(struct gk20a *g,
			  struct vm_gk20a *vm, struct gr_ctx_desc *gr_ctx)
{
	gk20a_dbg_fn("");

	if (!gr_ctx || !gr_ctx->mem.gpu_va)
		return;

	gk20a_gmmu_unmap(vm, gr_ctx->mem.gpu_va,
			 gr_ctx->mem.size, gk20a_mem_flag_none);
	gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, &gr_ctx->mem);
	kfree(gr_ctx);
}

void gr_gk20a_free_tsg_gr_ctx(struct tsg_gk20a *tsg)
{
	if (!tsg->vm) {
		gk20a_err(dev_from_gk20a(tsg->g), "No address space bound\n");
		return;
	}
	tsg->g->ops.gr.free_gr_ctx(tsg->g, tsg->vm, tsg->tsg_gr_ctx);
	tsg->tsg_gr_ctx = NULL;
}

static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
{
	c->g->ops.gr.free_gr_ctx(c->g, c->vm, c->ch_ctx.gr_ctx);
	c->ch_ctx.gr_ctx = NULL;
}
static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
				struct channel_gk20a *c)
{
	struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
	struct vm_gk20a *ch_vm = c->vm;
	int err = 0;

	gk20a_dbg_fn("");

	err = gk20a_gmmu_alloc_map_attr(ch_vm, DMA_ATTR_NO_KERNEL_MAPPING,
					128 * sizeof(u32), &patch_ctx->mem);
	if (err)
		return err;

	gk20a_dbg_fn("done");
	return 0;
}

static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
{
	struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
	struct gk20a *g = c->g;

	gk20a_dbg_fn("");

	if (patch_ctx->mem.gpu_va)
		gk20a_gmmu_unmap(c->vm, patch_ctx->mem.gpu_va,
				 patch_ctx->mem.size, gk20a_mem_flag_none);

	gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, &patch_ctx->mem);
	patch_ctx->data_count = 0;
}
void gk20a_free_channel_ctx(struct channel_gk20a *c)
{
	gr_gk20a_unmap_global_ctx_buffers(c);
	gr_gk20a_free_channel_patch_ctx(c);
	if (!gk20a_is_channel_marked_as_tsg(c))
		gr_gk20a_free_channel_gr_ctx(c);

	/* zcull_ctx, pm_ctx */

	memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));

	c->num_objects = 0;
	c->first_init = false;
}

static bool gr_gk20a_is_valid_class(struct gk20a *g, u32 class_num)
{
	bool valid = false;

	switch (class_num) {
	case KEPLER_COMPUTE_A:
	case KEPLER_DMA_COPY_A:
		valid = true;
		break;

	default:
		break;
	}

	return valid;
}
int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
			struct nvgpu_alloc_obj_ctx_args *args)
{
	struct gk20a *g = c->g;
	struct fifo_gk20a *f = &g->fifo;
	struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
	struct tsg_gk20a *tsg = NULL;
	int err = 0;

	gk20a_dbg_fn("");

	/* an address space needs to have been bound at this point.*/
	if (!gk20a_channel_as_bound(c) && !c->vm) {
		gk20a_err(dev_from_gk20a(g),
			  "not bound to address space at time"
			  " of grctx allocation");
		return -EINVAL;
	}

	if (!g->ops.gr.is_valid_class(g, args->class_num)) {
		gk20a_err(dev_from_gk20a(g),
			  "invalid obj class 0x%x", args->class_num);
		err = -EINVAL;
		goto out;
	}
	c->obj_class = args->class_num;

	if (gk20a_is_channel_marked_as_tsg(c))
		tsg = &f->tsg[c->tsgid];

	/* allocate gr ctx buffer */
	if (!tsg) {
		if (!ch_ctx->gr_ctx) {
			err = gr_gk20a_alloc_channel_gr_ctx(g, c,
							    args->class_num,
							    args->flags);
			if (err) {
				gk20a_err(dev_from_gk20a(g),
					"fail to allocate gr ctx buffer");
				goto out;
			}
		} else {
			/*TBD: needs to be more subtle about which is
			 * being allocated as some are allowed to be
			 * allocated along same channel */
			gk20a_err(dev_from_gk20a(g),
				"too many classes alloc'd on same channel");
			err = -EINVAL;
			goto out;
		}
	} else {
		if (!tsg->tsg_gr_ctx) {
			tsg->vm = c->vm;
			gk20a_vm_get(tsg->vm);
			err = gr_gk20a_alloc_tsg_gr_ctx(g, tsg,
							args->class_num,
							args->flags);
			if (err) {
				gk20a_err(dev_from_gk20a(g),
					"fail to allocate TSG gr ctx buffer");
				gk20a_vm_put(tsg->vm);
				tsg->vm = NULL;
				goto out;
			}
		}
		ch_ctx->gr_ctx = tsg->tsg_gr_ctx;
	}

	/* commit gr ctx buffer */
	err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va);
	if (err) {
		gk20a_err(dev_from_gk20a(g),
			"fail to commit gr ctx buffer");
		goto out;
	}

	/* allocate patch buffer */
	if (ch_ctx->patch_ctx.mem.sgt == NULL) {
		err = gr_gk20a_alloc_channel_patch_ctx(g, c);
		if (err) {
			gk20a_err(dev_from_gk20a(g),
				"fail to allocate patch buffer");
			goto out;
		}
	}

	/* map global buffer to channel gpu_va and commit */
	if (!ch_ctx->global_ctx_buffer_mapped) {
		err = gr_gk20a_map_global_ctx_buffers(g, c);
		if (err) {
			gk20a_err(dev_from_gk20a(g),
				"fail to map global ctx buffer");
			goto out;
		}
		gr_gk20a_elpg_protected_call(g,
			gr_gk20a_commit_global_ctx_buffers(g, c, true));
	}

	/* tweak any perf parameters per-context here */
	if (args->class_num == KEPLER_COMPUTE_A) {
		int begin_err;
		u32 texlock;
		u32 lockboost_mask;
		u32 lockboost;
		u32 tex_lock_disable_mask;

		if (support_gk20a_pmu(g->dev)) {
			err = gk20a_pmu_disable_elpg(g);
			if (err) {
				gk20a_err(dev_from_gk20a(g),
					  "failed to set disable elpg");
			}
		}

		tex_lock_disable_mask =
			gr_gpcs_tpcs_sm_sch_texlock_tex_hash_m() |
			gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_m() |
			gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_m() |
			gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_m() |
			gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_m() |
			gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_m();

		texlock = gk20a_readl(g, gr_gpcs_tpcs_sm_sch_texlock_r());

		texlock = (texlock & ~tex_lock_disable_mask) |
			(gr_gpcs_tpcs_sm_sch_texlock_tex_hash_disable_f() |
			 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_disable_f() |
			 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_disable_f() |
			 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_disable_f() |
			 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_disable_f() |
			 gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_disable_f());

		lockboost_mask =
			gr_gpcs_tpcs_sm_sch_macro_sched_lockboost_size_m();

		lockboost = gk20a_readl(g, gr_gpcs_tpcs_sm_sch_macro_sched_r());
		lockboost = (lockboost & ~lockboost_mask) |
			gr_gpcs_tpcs_sm_sch_macro_sched_lockboost_size_f(0);

		begin_err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);

		if (!begin_err) {
			err = gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpcs_tpcs_sm_sch_texlock_r(),
				texlock, true);

			if (!err)
				err = gr_gk20a_ctx_patch_write(g, ch_ctx,
					gr_gpcs_tpcs_sm_sch_macro_sched_r(),
					lockboost, true);
		}
		if ((begin_err || err)) {
			gk20a_err(dev_from_gk20a(g),
				  "failed to set texlock for compute class");
		}
		if (!begin_err)
			gr_gk20a_ctx_patch_write_end(g, ch_ctx);

		args->flags |= NVGPU_ALLOC_OBJ_FLAGS_LOCKBOOST_ZERO;

		if (support_gk20a_pmu(g->dev))
			gk20a_pmu_enable_elpg(g);
	}

	/* init golden image, ELPG enabled after this is done */
	err = gr_gk20a_init_golden_ctx_image(g, c);
	if (err) {
		gk20a_err(dev_from_gk20a(g),
			"fail to init golden ctx image");
		goto out;
	}

	/* load golden image */
	if (!c->first_init) {
		err = gr_gk20a_elpg_protected_call(g,
			gr_gk20a_load_golden_ctx_image(g, c));
		if (err) {
			gk20a_err(dev_from_gk20a(g),
				"fail to load golden ctx image");
			goto out;
		}
		c->first_init = true;
	}

	c->num_objects++;

	gk20a_dbg_fn("done");
	return 0;
out:
	/* 1. gr_ctx, patch_ctx and global ctx buffer mapping
	   can be reused so no need to release them.
	   2. golden image init and load is a one time thing so if
	   they pass, no need to undo. */
	gk20a_err(dev_from_gk20a(g), "fail");
	return err;
}
int gk20a_free_obj_ctx(struct channel_gk20a *c,
		       struct nvgpu_free_obj_ctx_args *args)
{
	gk20a_dbg_fn("");

	if (c->num_objects == 0)
		return 0;

	c->num_objects--;

	if (c->num_objects == 0) {
		c->first_init = false;
		gk20a_disable_channel(c);
		gr_gk20a_free_channel_patch_ctx(c);
	}

	return 0;
}
int gk20a_comptag_allocator_init(struct gk20a_comptag_allocator *allocator,
				 unsigned long size)
{
	mutex_init(&allocator->lock);
	/*
	 * 0th comptag is special and is never used. The base for this bitmap
	 * is 1, and its size is one less than the size of comptag store.
	 */
	size--;
	allocator->bitmap = vzalloc(BITS_TO_LONGS(size) * sizeof(long));
	if (!allocator->bitmap)
		return -ENOMEM;
	allocator->size = size;
	return 0;
}

void gk20a_comptag_allocator_destroy(struct gk20a_comptag_allocator *allocator)
{
	/*
	 * called only when exiting the driver (gk20a_remove, or unwinding the
	 * init stage); no users should be active, so taking the mutex is
	 * not necessary here.
	 */
	allocator->size = 0;
	vfree(allocator->bitmap);
}
static void gk20a_remove_gr_support(struct gr_gk20a *gr)
{
	struct gk20a *g = gr->g;

	gk20a_dbg_fn("");

	gr_gk20a_free_cyclestats_snapshot_data(g);

	gr_gk20a_free_global_ctx_buffers(g);

	gk20a_gmmu_free(g, &gr->mmu_wr_mem);
	gk20a_gmmu_free(g, &gr->mmu_rd_mem);

	gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING,
			     &gr->compbit_store.mem);

	memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));

	kfree(gr->gpc_tpc_count);
	kfree(gr->gpc_zcb_count);
	kfree(gr->gpc_ppc_count);
	kfree(gr->pes_tpc_count[0]);
	kfree(gr->pes_tpc_count[1]);
	kfree(gr->pes_tpc_mask[0]);
	kfree(gr->pes_tpc_mask[1]);
	kfree(gr->sm_to_cluster);
	kfree(gr->gpc_skip_mask);
	kfree(gr->map_tiles);
	gr->gpc_tpc_count = NULL;
	gr->gpc_zcb_count = NULL;
	gr->gpc_ppc_count = NULL;
	gr->pes_tpc_count[0] = NULL;
	gr->pes_tpc_count[1] = NULL;
	gr->pes_tpc_mask[0] = NULL;
	gr->pes_tpc_mask[1] = NULL;
	gr->gpc_skip_mask = NULL;
	gr->map_tiles = NULL;

	kfree(gr->ctx_vars.ucode.fecs.inst.l);
	kfree(gr->ctx_vars.ucode.fecs.data.l);
	kfree(gr->ctx_vars.ucode.gpccs.inst.l);
	kfree(gr->ctx_vars.ucode.gpccs.data.l);
	kfree(gr->ctx_vars.sw_bundle_init.l);
	kfree(gr->ctx_vars.sw_method_init.l);
	kfree(gr->ctx_vars.sw_ctx_load.l);
	kfree(gr->ctx_vars.sw_non_ctx_load.l);
	kfree(gr->ctx_vars.ctxsw_regs.sys.l);
	kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
	kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
	kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
	kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
	kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
	kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
	kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);

	kfree(gr->ctx_vars.local_golden_image);
	gr->ctx_vars.local_golden_image = NULL;

	gk20a_comptag_allocator_destroy(&gr->comp_tags);
}
static void gr_gk20a_bundle_cb_defaults(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;

	gr->bundle_cb_default_size =
		gr_scc_bundle_cb_size_div_256b__prod_v();
	gr->min_gpm_fifo_depth =
		gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
	gr->bundle_cb_token_limit =
		gr_pd_ab_dist_cfg2_token_limit_init_v();
}
static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
{
	u32 gpc_index, pes_index;
	u32 pes_tpc_mask;
	u32 pes_tpc_count;
	u32 pes_heavy_index;
	u32 gpc_new_skip_mask;
	u32 tmp;

	tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
	gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);

	tmp = gk20a_readl(g, top_num_gpcs_r());
	gr->max_gpc_count = top_num_gpcs_value_v(tmp);

	tmp = gk20a_readl(g, top_num_fbps_r());
	gr->max_fbps_count = top_num_fbps_value_v(tmp);

	gr->fbp_en_mask = g->ops.gr.get_fbp_en_mask(g);

	tmp = gk20a_readl(g, top_tpc_per_gpc_r());
	gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);

	gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;

	tmp = gk20a_readl(g, top_num_fbps_r());
	gr->sys_count = top_num_fbps_value_v(tmp);

	tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
	gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);

	gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
	gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();

	if (!gr->gpc_count) {
		gk20a_err(dev_from_gk20a(g), "gpc_count==0!");
		return -EINVAL;
	}

	gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
	gr->gpc_tpc_mask = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
	gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
	gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
	gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
	gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
	gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
	gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
	gr->gpc_skip_mask =
		kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
			GFP_KERNEL);

	if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
	    !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
	    !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
		goto clean_up;

	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
		tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());

		gr->gpc_tpc_count[gpc_index] =
			gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
		gr->tpc_count += gr->gpc_tpc_count[gpc_index];

		gr->gpc_zcb_count[gpc_index] =
			gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
		gr->zcb_count += gr->gpc_zcb_count[gpc_index];

		gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
		gr->ppc_count += gr->gpc_ppc_count[gpc_index];

		if (g->ops.gr.get_gpc_tpc_mask)
			gr->gpc_tpc_mask[gpc_index] =
				g->ops.gr.get_gpc_tpc_mask(g, gpc_index);

		for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
			tmp = gk20a_readl(g,
				gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
				gpc_index * proj_gpc_stride_v());

			pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
			pes_tpc_count = count_bits(pes_tpc_mask);

			gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
			gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
		}

		gpc_new_skip_mask = 0;
		if (gr->pes_tpc_count[0][gpc_index] +
		    gr->pes_tpc_count[1][gpc_index] == 5) {
			pes_heavy_index =
				gr->pes_tpc_count[0][gpc_index] >
				gr->pes_tpc_count[1][gpc_index] ? 0 : 1;

			gpc_new_skip_mask =
				gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
				(gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
				(gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));

		} else if ((gr->pes_tpc_count[0][gpc_index] +
			    gr->pes_tpc_count[1][gpc_index] == 4) &&
			   (gr->pes_tpc_count[0][gpc_index] !=
			    gr->pes_tpc_count[1][gpc_index])) {
			pes_heavy_index =
				gr->pes_tpc_count[0][gpc_index] >
				gr->pes_tpc_count[1][gpc_index] ? 0 : 1;

			gpc_new_skip_mask =
				gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
				(gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
				(gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
		}
		gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
	}

	gr->sm_to_cluster = kzalloc(gr->gpc_count * gr->tpc_count *
				    sizeof(struct sm_info), GFP_KERNEL);
	if (!gr->sm_to_cluster)
		goto clean_up;

	gk20a_dbg_info("fbps: %d", gr->num_fbps);
	gk20a_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
	gk20a_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
	gk20a_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
	gk20a_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
	gk20a_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
	gk20a_dbg_info("sys_count: %d", gr->sys_count);
	gk20a_dbg_info("gpc_count: %d", gr->gpc_count);
	gk20a_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
	gk20a_dbg_info("tpc_count: %d", gr->tpc_count);
	gk20a_dbg_info("ppc_count: %d", gr->ppc_count);

	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
		gk20a_dbg_info("gpc_tpc_count[%d] : %d",
			   gpc_index, gr->gpc_tpc_count[gpc_index]);
	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
		gk20a_dbg_info("gpc_zcb_count[%d] : %d",
			   gpc_index, gr->gpc_zcb_count[gpc_index]);
	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
		gk20a_dbg_info("gpc_ppc_count[%d] : %d",
			   gpc_index, gr->gpc_ppc_count[gpc_index]);
	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
		gk20a_dbg_info("gpc_skip_mask[%d] : %d",
			   gpc_index, gr->gpc_skip_mask[gpc_index]);
	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
		for (pes_index = 0;
		     pes_index < gr->pe_count_per_gpc;
		     pes_index++)
			gk20a_dbg_info("pes_tpc_count[%d][%d] : %d",
				   pes_index, gpc_index,
				   gr->pes_tpc_count[pes_index][gpc_index]);

	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
		for (pes_index = 0;
		     pes_index < gr->pe_count_per_gpc;
		     pes_index++)
			gk20a_dbg_info("pes_tpc_mask[%d][%d] : %d",
				   pes_index, gpc_index,
				   gr->pes_tpc_mask[pes_index][gpc_index]);

	g->ops.gr.bundle_cb_defaults(g);
	g->ops.gr.cb_size_default(g);
	g->ops.gr.calc_global_ctx_buffer_size(g);
	gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();

	gk20a_dbg_info("bundle_cb_default_size: %d",
		   gr->bundle_cb_default_size);
	gk20a_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
	gk20a_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
	gk20a_dbg_info("attrib_cb_default_size: %d",
		   gr->attrib_cb_default_size);
	gk20a_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
	gk20a_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
	gk20a_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
	gk20a_dbg_info("timeslice_mode: %d", gr->timeslice_mode);

	return 0;

clean_up:
	return -ENOMEM;
}
static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
{
	int err;

	err = gk20a_gmmu_alloc(g, 0x1000, &gr->mmu_wr_mem);
	if (err)
		goto err;

	err = gk20a_gmmu_alloc(g, 0x1000, &gr->mmu_rd_mem);
	if (err)
		goto err_free_wr_mem;
	return 0;

err_free_wr_mem:
	gk20a_gmmu_free(g, &gr->mmu_wr_mem);
err:
	return -ENOMEM;
}
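
/*
 * The screen-tile row offset must share no common factor with the
 * tile count, or successive rows would land on the same TPCs, so
 * gr_gk20a_init_map_tiles() below picks the smallest prime that does
 * not divide tpc_count (with fixed overrides for a few known counts).
 */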
static u32 prime_set[18] = {
	2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };

static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
{
	s32 comm_denom;
	s32 mul_factor;
	s32 *init_frac = NULL;
	s32 *init_err = NULL;
	s32 *run_err = NULL;
	s32 *sorted_num_tpcs = NULL;
	s32 *sorted_to_unsorted_gpc_map = NULL;
	u32 gpc_index;
	u32 gpc_mark = 0;
	u32 num_tpc;
	u32 max_tpc_count = 0;
	u32 swap;
	u32 tile_count;
	u32 index;
	bool delete_map = false;
	bool gpc_sorted;
	int ret = 0;

	init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
	init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
	run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
	sorted_num_tpcs =
		kzalloc(proj_scal_max_gpcs_v() *
			proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
			GFP_KERNEL);
	sorted_to_unsorted_gpc_map =
		kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);

	if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
	      sorted_to_unsorted_gpc_map)) {
		ret = -ENOMEM;
		goto clean_up;
	}

	gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;

	if (gr->tpc_count == 3)
		gr->map_row_offset = 2;
	else if (gr->tpc_count < 3)
		gr->map_row_offset = 1;
	else {
		gr->map_row_offset = 3;

		for (index = 1; index < 18; index++) {
			u32 prime = prime_set[index];
			if ((gr->tpc_count % prime) != 0) {
				gr->map_row_offset = prime;
				break;
			}
		}
	}

	switch (gr->tpc_count) {
	case 15:
		gr->map_row_offset = 6;
		break;
	case 14:
		gr->map_row_offset = 5;
		break;
	case 13:
		gr->map_row_offset = 2;
		break;
	case 11:
		gr->map_row_offset = 7;
		break;
	case 10:
		gr->map_row_offset = 6;
		break;
	case 7:
	case 5:
		gr->map_row_offset = 1;
		break;
	default:
		break;
	}

	if (gr->map_tiles) {
		if (gr->map_tile_count != gr->tpc_count)
			delete_map = true;

		for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
			if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
				delete_map = true;
		}

		if (delete_map) {
			kfree(gr->map_tiles);
			gr->map_tiles = NULL;
			gr->map_tile_count = 0;
		}
	}

	if (gr->map_tiles == NULL) {
		gr->map_tile_count = proj_scal_max_gpcs_v();

		gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
		if (gr->map_tiles == NULL) {
			ret = -ENOMEM;
			goto clean_up;
		}

		for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
			sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
			sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
		}

		gpc_sorted = false;
		while (!gpc_sorted) {
			gpc_sorted = true;
			for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
				if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
					gpc_sorted = false;
					swap = sorted_num_tpcs[gpc_index];
					sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
					sorted_num_tpcs[gpc_index + 1] = swap;
					swap = sorted_to_unsorted_gpc_map[gpc_index];
					sorted_to_unsorted_gpc_map[gpc_index] =
						sorted_to_unsorted_gpc_map[gpc_index + 1];
					sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
				}
			}
		}

		for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
			if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
				max_tpc_count = gr->gpc_tpc_count[gpc_index];

		mul_factor = gr->gpc_count * max_tpc_count;
		if (mul_factor & 0x1)
			mul_factor = 2;
		else
			mul_factor = 1;

		comm_denom = gr->gpc_count * max_tpc_count * mul_factor;

		for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
			num_tpc = sorted_num_tpcs[gpc_index];

			init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;

			if (num_tpc != 0)
				init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
			else
				init_err[gpc_index] = 0;

			run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
		}
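
		/*
		 * Distribute tiles over GPCs with a Bresenham-style
		 * error accumulator: each GPC's running error grows by
		 * init_frac per round, and a GPC emits a tile (paying
		 * back comm_denom) whenever its running error reaches
		 * half the common denominator. GPCs with more TPCs
		 * therefore appear proportionally more often in the map.
		 */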
		while (gpc_mark < gr->tpc_count) {
			for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
				if ((run_err[gpc_index] * 2) >= comm_denom) {
					gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
					run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
				} else
					run_err[gpc_index] += init_frac[gpc_index];
			}
		}
	}

clean_up:
	kfree(init_frac);
	kfree(init_err);
	kfree(run_err);
	kfree(sorted_num_tpcs);
	kfree(sorted_to_unsorted_gpc_map);

	if (ret)
		gk20a_err(dev_from_gk20a(g), "fail");
	else
		gk20a_dbg_fn("done");

	return ret;
}
static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
{
	struct gr_zcull_gk20a *zcull = &gr->zcull;

	zcull->aliquot_width = gr->tpc_count * 16;
	zcull->aliquot_height = 16;

	zcull->width_align_pixels = gr->tpc_count * 16;
	zcull->height_align_pixels = 32;

	zcull->aliquot_size =
		zcull->aliquot_width * zcull->aliquot_height;

	/* assume no floor sweeping since we only have 1 tpc in 1 gpc */
	zcull->pixel_squares_by_aliquots =
		gr->zcb_count * 16 * 16 * gr->tpc_count /
		(gr->gpc_count * gr->gpc_tpc_count[0]);

	zcull->total_aliquots =
		gr_gpc0_zcull_total_ram_size_num_aliquots_f(
			gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));

	return 0;
}
u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
{
	/* assuming gr has already been initialized */
	return gr->ctx_vars.zcull_ctxsw_image_size;
}

int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
			struct channel_gk20a *c, u64 zcull_va, u32 mode)
{
	struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;

	zcull_ctx->ctx_sw_mode = mode;
	zcull_ctx->gpu_va = zcull_va;

	/* TBD: don't disable channel in sw method processing */
	return gr_gk20a_ctx_zcull_setup(g, c, true);
}

int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
			struct gr_zcull_info *zcull_params)
{
	struct gr_zcull_gk20a *zcull = &gr->zcull;

	zcull_params->width_align_pixels = zcull->width_align_pixels;
	zcull_params->height_align_pixels = zcull->height_align_pixels;
	zcull_params->pixel_squares_by_aliquots =
		zcull->pixel_squares_by_aliquots;
	zcull_params->aliquot_total = zcull->total_aliquots;

	zcull_params->region_byte_multiplier =
		gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
	zcull_params->region_header_size =
		proj_scal_litter_num_gpcs_v() *
		gr_zcull_save_restore_header_bytes_per_gpc_v();

	zcull_params->subregion_header_size =
		proj_scal_litter_num_gpcs_v() *
		gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();

	zcull_params->subregion_width_align_pixels =
		gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
	zcull_params->subregion_height_align_pixels =
		gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
	zcull_params->subregion_count = gr_zcull_subregion_qty_v();

	return 0;
}
static void gr_gk20a_detect_sm_arch(struct gk20a *g)
{
	u32 v = gk20a_readl(g, gr_gpc0_tpc0_sm_arch_r());
	u32 raw_version = gr_gpc0_tpc0_sm_arch_spa_version_v(v);
	u32 version = 0;

	if (raw_version == gr_gpc0_tpc0_sm_arch_spa_version_smkepler_lp_v())
		version = 0x320; /* SM 3.2 */
	else
		gk20a_err(dev_from_gk20a(g), "Unknown SM version 0x%x\n",
			  raw_version);

	/* on Kepler, SM version == SPA version */
	g->gpu_characteristics.sm_arch_spa_version = version;
	g->gpu_characteristics.sm_arch_sm_version = version;

	g->gpu_characteristics.sm_arch_warp_count =
		gr_gpc0_tpc0_sm_arch_warp_count_v(v);
}
int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
			   struct zbc_entry *color_val, u32 index)
{
	u32 i;

	/* update l2 table */
	g->ops.ltc.set_zbc_color_entry(g, color_val, index);

	/* update ds table */
	gk20a_writel(g, gr_ds_zbc_color_r_r(),
		gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
	gk20a_writel(g, gr_ds_zbc_color_g_r(),
		gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
	gk20a_writel(g, gr_ds_zbc_color_b_r(),
		gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
	gk20a_writel(g, gr_ds_zbc_color_a_r(),
		gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));

	gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
		gr_ds_zbc_color_fmt_val_f(color_val->format));

	gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
		gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));

	/* trigger the write */
	gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
		gr_ds_zbc_tbl_ld_select_c_f() |
		gr_ds_zbc_tbl_ld_action_write_f() |
		gr_ds_zbc_tbl_ld_trigger_active_f());

	/* update local copy */
	for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
		gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
		gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
	}
	gr->zbc_col_tbl[index].format = color_val->format;
	gr->zbc_col_tbl[index].ref_cnt++;

	return 0;
}
int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
			   struct zbc_entry *depth_val, u32 index)
{
	/* update l2 table */
	g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);

	/* update ds table */
	gk20a_writel(g, gr_ds_zbc_z_r(),
		gr_ds_zbc_z_val_f(depth_val->depth));

	gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
		gr_ds_zbc_z_fmt_val_f(depth_val->format));

	gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
		gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));

	/* trigger the write */
	gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
		gr_ds_zbc_tbl_ld_select_z_f() |
		gr_ds_zbc_tbl_ld_action_write_f() |
		gr_ds_zbc_tbl_ld_trigger_active_f());

	/* update local copy */
	gr->zbc_dep_tbl[index].depth = depth_val->depth;
	gr->zbc_dep_tbl[index].format = depth_val->format;
	gr->zbc_dep_tbl[index].ref_cnt++;

	return 0;
}
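
/*
 * Add a ZBC clear value: reuse an existing table entry (bumping its
 * refcount) when an identical color/depth value is already present,
 * otherwise program the next free index in both the DS and L2 tables.
 * The PMU is told to re-save the ZBC tables only when a new entry was
 * actually added, so that ELPG exit restores them correctly.
 */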
int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
		     struct zbc_entry *zbc_val)
{
	struct zbc_color_table *c_tbl;
	struct zbc_depth_table *d_tbl;
	u32 i, ret = -ENOMEM;
	bool added = false;
	u32 entries;

	/* no endian swap ? */

	mutex_lock(&gr->zbc_lock);
	switch (zbc_val->type) {
	case GK20A_ZBC_TYPE_COLOR:
		/* search existing tables */
		for (i = 0; i < gr->max_used_color_index; i++) {

			c_tbl = &gr->zbc_col_tbl[i];

			if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
			    memcmp(c_tbl->color_ds, zbc_val->color_ds,
				   sizeof(zbc_val->color_ds)) == 0) {

				if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
					   sizeof(zbc_val->color_l2))) {
					gk20a_err(dev_from_gk20a(g),
						"zbc l2 and ds color don't match with existing entries");
					ret = -EINVAL;
					goto err_mutex;
				}
				added = true;
				c_tbl->ref_cnt++;
				ret = 0;
				break;
			}
		}
		/* add new table */
		if (!added &&
		    gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {

			c_tbl =
				&gr->zbc_col_tbl[gr->max_used_color_index];
			WARN_ON(c_tbl->ref_cnt != 0);

			ret = g->ops.gr.add_zbc_color(g, gr,
				zbc_val, gr->max_used_color_index);

			if (!ret)
				gr->max_used_color_index++;
		}
		break;
	case GK20A_ZBC_TYPE_DEPTH:
		/* search existing tables */
		for (i = 0; i < gr->max_used_depth_index; i++) {

			d_tbl = &gr->zbc_dep_tbl[i];

			if (d_tbl->ref_cnt &&
			    d_tbl->depth == zbc_val->depth &&
			    d_tbl->format == zbc_val->format) {
				added = true;
				d_tbl->ref_cnt++;
				ret = 0;
				break;
			}
		}
		/* add new table */
		if (!added &&
		    gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {

			d_tbl =
				&gr->zbc_dep_tbl[gr->max_used_depth_index];
			WARN_ON(d_tbl->ref_cnt != 0);

			ret = g->ops.gr.add_zbc_depth(g, gr,
				zbc_val, gr->max_used_depth_index);

			if (!ret)
				gr->max_used_depth_index++;
		}
		break;
	default:
		gk20a_err(dev_from_gk20a(g),
			"invalid zbc table type %d", zbc_val->type);
		ret = -EINVAL;
		goto err_mutex;
	}

	if (!added && ret == 0) {
		/* update zbc for elpg only when new entry is added */
		entries = max(gr->max_used_color_index,
			      gr->max_used_depth_index);
		gk20a_pmu_save_zbc(g, entries);
	}

err_mutex:
	mutex_unlock(&gr->zbc_lock);
	return ret;
}
/* get a zbc table entry specified by index
 * return table size when type is invalid */
int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
			struct zbc_query_params *query_params)
{
	u32 index = query_params->index_size;
	u32 i;

	switch (query_params->type) {
	case GK20A_ZBC_TYPE_INVALID:
		query_params->index_size = GK20A_ZBC_TABLE_SIZE;
		break;
	case GK20A_ZBC_TYPE_COLOR:
		if (index >= GK20A_ZBC_TABLE_SIZE) {
			gk20a_err(dev_from_gk20a(g),
				"invalid zbc color table index\n");
			return -EINVAL;
		}
		for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
			query_params->color_l2[i] =
				gr->zbc_col_tbl[index].color_l2[i];
			query_params->color_ds[i] =
				gr->zbc_col_tbl[index].color_ds[i];
		}
		query_params->format = gr->zbc_col_tbl[index].format;
		query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
		break;
	case GK20A_ZBC_TYPE_DEPTH:
		if (index >= GK20A_ZBC_TABLE_SIZE) {
			gk20a_err(dev_from_gk20a(g),
				"invalid zbc depth table index\n");
			return -EINVAL;
		}
		query_params->depth = gr->zbc_dep_tbl[index].depth;
		query_params->format = gr->zbc_dep_tbl[index].format;
		query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
		break;
	default:
		gk20a_err(dev_from_gk20a(g),
				"invalid zbc table type\n");
		return -EINVAL;
	}

	return 0;
}
static int gr_gk20a_load_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
{
	u32 i;
	int ret;

	for (i = 0; i < gr->max_used_color_index; i++) {
		struct zbc_color_table *c_tbl = &gr->zbc_col_tbl[i];
		struct zbc_entry zbc_val;

		zbc_val.type = GK20A_ZBC_TYPE_COLOR;
		memcpy(zbc_val.color_ds,
		       c_tbl->color_ds, sizeof(zbc_val.color_ds));
		memcpy(zbc_val.color_l2,
		       c_tbl->color_l2, sizeof(zbc_val.color_l2));
		zbc_val.format = c_tbl->format;

		ret = g->ops.gr.add_zbc_color(g, gr, &zbc_val, i);
		if (ret)
			return ret;
	}
	for (i = 0; i < gr->max_used_depth_index; i++) {
		struct zbc_depth_table *d_tbl = &gr->zbc_dep_tbl[i];
		struct zbc_entry zbc_val;

		zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
		zbc_val.depth = d_tbl->depth;
		zbc_val.format = d_tbl->format;

		ret = g->ops.gr.add_zbc_depth(g, gr, &zbc_val, i);
		if (ret)
			return ret;
	}

	return 0;
}
int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
{
	struct zbc_entry zbc_val;
	u32 i;
	int err;

	mutex_init(&gr->zbc_lock);

	/* load default color table */
	zbc_val.type = GK20A_ZBC_TYPE_COLOR;

	/* Opaque black (i.e. solid black, fmt 0x28 = A8B8G8R8) */
	zbc_val.format = gr_ds_zbc_color_fmt_val_a8_b8_g8_r8_v();
	for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
		zbc_val.color_ds[i] = 0;
		zbc_val.color_l2[i] = 0;
	}
	zbc_val.color_l2[0] = 0xff000000;
	zbc_val.color_ds[3] = 0x3f800000;
	err = gr_gk20a_add_zbc(g, gr, &zbc_val);

	/* Transparent black = (fmt 1 = zero) */
	zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
	for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
		zbc_val.color_ds[i] = 0;
		zbc_val.color_l2[i] = 0;
	}
	err |= gr_gk20a_add_zbc(g, gr, &zbc_val);

	/* Opaque white (i.e. solid white) = (fmt 2 = uniform 1) */
	zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
	for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
		zbc_val.color_ds[i] = 0x3f800000;
		zbc_val.color_l2[i] = 0xffffffff;
	}
	err |= gr_gk20a_add_zbc(g, gr, &zbc_val);

	if (!err)
		gr->max_default_color_index = 3;
	else {
		gk20a_err(dev_from_gk20a(g),
			  "fail to load default zbc color table\n");
		return err;
	}

	/* load default depth table */
	zbc_val.type = GK20A_ZBC_TYPE_DEPTH;

	zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
	zbc_val.depth = 0x3f800000;
	err = gr_gk20a_add_zbc(g, gr, &zbc_val);

	zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
	zbc_val.depth = 0;
	err |= gr_gk20a_add_zbc(g, gr, &zbc_val);

	if (!err)
		gr->max_default_depth_index = 2;
	else {
		gk20a_err(dev_from_gk20a(g),
			  "fail to load default zbc depth table\n");
		return err;
	}

	return 0;
}
int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
			struct zbc_entry *zbc_val)
{
	gk20a_dbg_fn("");

	return gr_gk20a_elpg_protected_call(g,
		gr_gk20a_add_zbc(g, gr, zbc_val));
}
void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
{
	u32 gate_ctrl;

	gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));

	switch (mode) {
	case BLCG_RUN:
		gate_ctrl = set_field(gate_ctrl,
				therm_gate_ctrl_blk_clk_m(),
				therm_gate_ctrl_blk_clk_run_f());
		break;
	case BLCG_AUTO:
		gate_ctrl = set_field(gate_ctrl,
				therm_gate_ctrl_blk_clk_m(),
				therm_gate_ctrl_blk_clk_auto_f());
		break;
	default:
		gk20a_err(dev_from_gk20a(g),
			"invalid blcg mode %d", mode);
		return;
	}

	gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
}
void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
{
	u32 gate_ctrl, idle_filter;

	gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));

	switch (mode) {
	case ELCG_RUN:
		gate_ctrl = set_field(gate_ctrl,
				therm_gate_ctrl_eng_clk_m(),
				therm_gate_ctrl_eng_clk_run_f());
		gate_ctrl = set_field(gate_ctrl,
				therm_gate_ctrl_eng_pwr_m(),
				/* set elpg to auto to meet hw expectation */
				therm_gate_ctrl_eng_pwr_auto_f());
		break;
	case ELCG_STOP:
		gate_ctrl = set_field(gate_ctrl,
				therm_gate_ctrl_eng_clk_m(),
				therm_gate_ctrl_eng_clk_stop_f());
		break;
	case ELCG_AUTO:
		gate_ctrl = set_field(gate_ctrl,
				therm_gate_ctrl_eng_clk_m(),
				therm_gate_ctrl_eng_clk_auto_f());
		break;
	default:
		gk20a_err(dev_from_gk20a(g),
			"invalid elcg mode %d", mode);
	}

	if (tegra_platform_is_linsim()) {
		gate_ctrl = set_field(gate_ctrl,
			therm_gate_ctrl_eng_delay_after_m(),
			therm_gate_ctrl_eng_delay_after_f(4));
	}

	/* 2 * (1 << 9) = 1024 clks */
	gate_ctrl = set_field(gate_ctrl,
		therm_gate_ctrl_eng_idle_filt_exp_m(),
		therm_gate_ctrl_eng_idle_filt_exp_f(9));
	gate_ctrl = set_field(gate_ctrl,
		therm_gate_ctrl_eng_idle_filt_mant_m(),
		therm_gate_ctrl_eng_idle_filt_mant_f(2));
	gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);

	/* default fecs_idle_filter to 0 */
	idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
	idle_filter &= ~therm_fecs_idle_filter_value_m();
	gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
	/* default hubmmu_idle_filter to 0 */
	idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
	idle_filter &= ~therm_hubmmu_idle_filter_value_m();
	gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
}
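
/*
 * Program the zcull hardware: the map0..map3 registers below spread
 * SMs over GPCs according to gr->map_tiles, and each GPC then gets
 * its RAM layout, active bank count and a conservative 1/SM_NUM
 * reciprocal (computed with DIV_ROUND_UP so the estimate errs high).
 */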
static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
{
	u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
	u32 *zcull_map_tiles, *zcull_bank_counters;
	u32 map_counter;
	u32 rcp_conserv;
	u32 offset;
	bool floorsweep = false;

	zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
			proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
	if (!zcull_map_tiles) {
		gk20a_err(dev_from_gk20a(g),
			"failed to allocate zcull temp buffers");
		return -ENOMEM;
	}
	zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
			proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);

	if (!zcull_bank_counters) {
		gk20a_err(dev_from_gk20a(g),
			"failed to allocate zcull temp buffers");
		kfree(zcull_map_tiles);
		return -ENOMEM;
	}

	for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
		zcull_map_tiles[map_counter] =
			zcull_bank_counters[gr->map_tiles[map_counter]];
		zcull_bank_counters[gr->map_tiles[map_counter]]++;
	}

	gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
		gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
		gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
		gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
		gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
		gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
		gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
		gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
		gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));

	gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
		gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
		gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
		gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
		gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
		gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
		gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
		gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
		gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));

	gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
		gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
		gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
		gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
		gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
		gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
		gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
		gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
		gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));

	gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
		gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
		gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
		gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
		gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
		gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
		gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
		gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
		gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));

	kfree(zcull_map_tiles);
	kfree(zcull_bank_counters);

	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
		gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
		gpc_zcull_count = gr->gpc_zcb_count[gpc_index];

		if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
		    gpc_zcull_count < gpc_tpc_count) {
			gk20a_err(dev_from_gk20a(g),
				"zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
				gpc_zcull_count, gpc_tpc_count, gpc_index);
			return -EINVAL;
		}
		if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
		    gpc_zcull_count != 0)
			floorsweep = true;
	}

	/* ceil(1.0f / SM_NUM * gr_gpc0_zcull_sm_num_rcp_conservative__max_v()) */
	rcp_conserv = DIV_ROUND_UP(gr_gpc0_zcull_sm_num_rcp_conservative__max_v(),
		gr->gpc_tpc_count[0]);

	for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
		offset = gpc_index * proj_gpc_stride_v();

		if (floorsweep) {
			gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
				gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
				gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
					gr->max_zcull_per_gpc_count));
		} else {
			gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
				gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
				gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
					gr->gpc_tpc_count[gpc_index]));
		}

		gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
			gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
			gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));

		gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
			gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
	}

	gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
		gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));

	return 0;
}
static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;
	u32 tpc_mask;

	gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(),
		     gr_gpcs_tpcs_tpccs_tpc_exception_en_sm_enabled_f());

	tpc_mask =
		gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->tpc_count) - 1);

	gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), tpc_mask);
}

void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
{
	/* enable exceptions */
	gk20a_writel(g, gr_fe_hww_esr_r(),
		     gr_fe_hww_esr_en_enable_f() |
		     gr_fe_hww_esr_reset_active_f());
	gk20a_writel(g, gr_memfmt_hww_esr_r(),
		     gr_memfmt_hww_esr_en_enable_f() |
		     gr_memfmt_hww_esr_reset_active_f());
}
static void gr_gk20a_set_hww_esr_report_mask(struct gk20a *g)
{
	/* setup sm warp esr report masks */
	gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
		gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());

	/* setup sm global esr report mask */
	gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
		gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
		gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
		gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
		gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
		gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
		gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
		gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
}
4068 static int gk20a_init_gr_setup_hw(struct gk20a *g)
4070 struct gr_gk20a *gr = &g->gr;
4071 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4072 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4075 unsigned long end_jiffies = jiffies +
4076 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4077 u32 fe_go_idle_timeout_save;
4078 u32 last_method_data = 0;
4083 /* init mmu debug buffer */
4084 addr = g->ops.mm.get_iova_addr(g, gr->mmu_wr_mem.sgt->sgl, 0);
4085 addr >>= fb_mmu_debug_wr_addr_alignment_v();
4087 gk20a_writel(g, fb_mmu_debug_wr_r(),
4088 fb_mmu_debug_wr_aperture_vid_mem_f() |
4089 fb_mmu_debug_wr_vol_false_f() |
4090 fb_mmu_debug_wr_addr_f(addr));
4092 addr = g->ops.mm.get_iova_addr(g, gr->mmu_rd_mem.sgt->sgl, 0);
4093 addr >>= fb_mmu_debug_rd_addr_alignment_v();
4095 gk20a_writel(g, fb_mmu_debug_rd_r(),
4096 fb_mmu_debug_rd_aperture_vid_mem_f() |
4097 fb_mmu_debug_rd_vol_false_f() |
4098 fb_mmu_debug_rd_addr_f(addr));
4100 if (g->ops.gr.init_gpc_mmu)
4101 g->ops.gr.init_gpc_mmu(g);
4103 /* load gr floorsweeping registers */
4104 data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4105 data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4106 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4107 gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4109 gr_gk20a_zcull_init_hw(g, gr);
4111 /* Bug 1340570: increase the clock timeout to avoid potential
4112 * operation failure at high gpcclk rate. Default values are 0x400.
4113 */
4114 gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
4115 gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
4116 gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
4118 /* enable fifo access */
4119 gk20a_writel(g, gr_gpfifo_ctl_r(),
4120 gr_gpfifo_ctl_access_enabled_f() |
4121 gr_gpfifo_ctl_semaphore_access_enabled_f());
4123 /* TBD: reload gr ucode when needed */
4125 /* clear interrupt status and enable interrupts */
4126 gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4127 gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4129 /* enable fecs error interrupts */
4130 gk20a_writel(g, gr_fecs_host_int_enable_r(),
4131 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4132 gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4133 gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4134 gr_fecs_host_int_enable_watchdog_enable_f());
4136 g->ops.gr.enable_hww_exceptions(g);
4137 g->ops.gr.set_hww_esr_report_mask(g);
4139 /* enable TPC exceptions per GPC */
4140 gk20a_gr_enable_gpc_exceptions(g);
4142 /* TBD: ECC for L1/SM */
4143 /* TBD: enable per BE exceptions */
4145 /* reset and enable all exceptions */
4146 gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4147 gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4148 gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4149 gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4150 gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4151 gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4153 gr_gk20a_load_zbc_table(g, gr);
4155 g->ops.ltc.init_cbc(g, gr);
4158 for (i = 0; i < sw_ctx_load->count; i++)
4159 gk20a_writel(g, sw_ctx_load->l[i].addr,
4160 sw_ctx_load->l[i].value);
4162 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4166 /* save and disable fe_go_idle */
4167 fe_go_idle_timeout_save =
4168 gk20a_readl(g, gr_fe_go_idle_timeout_r());
4169 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4170 (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
4171 gr_fe_go_idle_timeout_count_disabled_f());
4173 /* override a few ctx state registers */
4174 g->ops.gr.commit_global_cb_manager(g, NULL, false);
4175 gr_gk20a_commit_global_timeslice(g, NULL, false);
4177 /* floorsweep anything left */
4178 g->ops.gr.init_fs_state(g);
4180 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4182 goto restore_fe_go_idle;
4185 /* restore fe_go_idle */
4186 gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
4188 if (err || gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT))
4191 /* load method init */
4192 if (sw_method_init->count) {
4193 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4194 sw_method_init->l[0].value);
4195 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4196 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4197 sw_method_init->l[0].addr);
4198 last_method_data = sw_method_init->l[0].value;
4200 for (i = 1; i < sw_method_init->count; i++) {
4201 if (sw_method_init->l[i].value != last_method_data) {
4202 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4203 sw_method_init->l[i].value);
4204 last_method_data = sw_method_init->l[i].value;
4206 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4207 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4208 sw_method_init->l[i].addr);
4211 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4216 gk20a_dbg_fn("done");
4220 static void gr_gk20a_load_gating_prod(struct gk20a *g)
4222 /* slcg prod values */
4223 g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled);
4224 if (g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod)
4225 g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod(g,
4227 g->ops.clock_gating.slcg_perf_load_gating_prod(g, g->slcg_enabled);
4229 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
4230 if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod)
4231 g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g,
4233 g->ops.clock_gating.pg_gr_load_gating_prod(g, true);
4236 static int gk20a_init_gr_prepare(struct gk20a *g)
4238 u32 gpfifo_ctrl, pmc_en;
4241 /* disable fifo access */
4242 pmc_en = gk20a_readl(g, mc_enable_r());
4243 if (pmc_en & mc_enable_pgraph_enabled_f()) {
4244 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4245 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4246 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4249 /* reset gr engine */
4250 gk20a_reset(g, mc_enable_pgraph_enabled_f()
4251 | mc_enable_blg_enabled_f()
4252 | mc_enable_perfmon_enabled_f());
4254 gr_gk20a_load_gating_prod(g);
4255 /* Disable elcg until it gets enabled later in the init */
4256 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
4257 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
4259 /* enable fifo access */
4260 gk20a_writel(g, gr_gpfifo_ctl_r(),
4261 gr_gpfifo_ctl_access_enabled_f() |
4262 gr_gpfifo_ctl_semaphore_access_enabled_f());
4264 if (!g->gr.ctx_vars.valid) {
4265 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4267 gk20a_err(dev_from_gk20a(g),
4268 "fail to load gr init ctx");
4273 static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
4275 int retries = GR_IDLE_CHECK_MAX / GR_IDLE_CHECK_DEFAULT;
4276 bool fecs_scrubbing;
4277 bool gpccs_scrubbing;
4282 fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
4283 (gr_fecs_dmactl_imem_scrubbing_m() |
4284 gr_fecs_dmactl_dmem_scrubbing_m());
4286 gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
4287 (gr_gpccs_dmactl_imem_scrubbing_m() |
4288 gr_gpccs_dmactl_dmem_scrubbing_m());
4290 if (!fecs_scrubbing && !gpccs_scrubbing) {
4291 gk20a_dbg_fn("done");
4295 udelay(GR_IDLE_CHECK_DEFAULT);
4296 } while (--retries || !tegra_platform_is_silicon());
4298 gk20a_err(dev_from_gk20a(g), "Falcon mem scrubbing timeout");
4302 static int gr_gk20a_init_ctxsw(struct gk20a *g)
4306 err = g->ops.gr.load_ctxsw_ucode(g);
4310 err = gr_gk20a_wait_ctxsw_ready(g);
4316 gk20a_err(dev_from_gk20a(g), "fail");
4318 gk20a_dbg_fn("done");
4323 static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4325 struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4326 unsigned long end_jiffies = jiffies +
4327 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4332 /* clear interrupt status and enable interrupts */
4333 gk20a_writel(g, gr_intr_r(), ~0);
4334 gk20a_writel(g, gr_intr_en_r(), ~0);
4337 gk20a_writel(g, gr_scc_init_r(),
4338 gr_scc_init_ram_trigger_f());
4340 /* load non_ctx init */
4341 for (i = 0; i < sw_non_ctx_load->count; i++)
4342 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4343 sw_non_ctx_load->l[i].value);
4345 err = gr_gk20a_wait_mem_scrubbing(g);
4349 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4355 gk20a_err(dev_from_gk20a(g), "fail");
4357 gk20a_dbg_fn("done");
4363 * XXX Merge this list with the debugger/profiler
4364 * session regops whitelists?
4366 static u32 wl_addr_gk20a[] = {
4367 /* this list must be sorted (low to high) */
4368 0x404468, /* gr_pri_mme_max_instructions */
4369 0x418800, /* gr_pri_gpcs_setup_debug */
4370 0x419a04, /* gr_pri_gpcs_tpcs_tex_lod_dbg */
4371 0x419a08, /* gr_pri_gpcs_tpcs_tex_samp_dbg */
4372 0x419e10, /* gr_pri_gpcs_tpcs_sm_dbgr_control0 */
4373 0x419f78, /* gr_pri_gpcs_tpcs_sm_disp_ctrl */
4376 static int gr_gk20a_init_access_map(struct gk20a *g)
4378 struct gr_gk20a *gr = &g->gr;
4381 u32 w, nr_pages =
4382 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4383 PAGE_SIZE);
4385 data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.pages,
4386 PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.size) >>
4387 PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
4389 gk20a_err(dev_from_gk20a(g),
4390 "failed to map priv access map memory");
4395 memset(data, 0x0, PAGE_SIZE * nr_pages);
4397 for (w = 0; w < ARRAY_SIZE(wl_addr_gk20a); w++) {
4398 u32 map_bit, map_byte, map_shift;
4399 map_bit = wl_addr_gk20a[w] >> 2;
4400 map_byte = map_bit >> 3;
4401 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4402 gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d",
4403 wl_addr_gk20a[w], map_byte, map_shift);
4404 ((u8 *)data)[map_byte] |= 1 << map_shift;
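/* Worked example (values derived from the table above, not new state):
 * wl_addr 0x418800 (gr_pri_gpcs_setup_debug) is priv word
 * 0x418800 >> 2 = 0x106200, so map_byte = 0x106200 >> 3 = 0x20c40 and
 * map_shift = 0; i.e. bit 0 of byte 0x20c40 in the access map is set. */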
4413 static int gk20a_init_gr_setup_sw(struct gk20a *g)
4415 struct gr_gk20a *gr = &g->gr;
4421 gk20a_dbg_fn("skip init");
4427 err = gr_gk20a_init_gr_config(g, gr);
4431 err = gr_gk20a_init_mmu_sw(g, gr);
4435 err = gr_gk20a_init_map_tiles(g, gr);
4439 gk20a_dbg_info("total ram pages : %lu", totalram_pages);
4440 gr->max_comptag_mem = totalram_pages
4441 >> (10 - (PAGE_SHIFT - 10));
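/* The shift folds the page-to-MB conversion into one step:
 * pages >> (10 - (PAGE_SHIFT - 10)) == (pages << PAGE_SHIFT) >> 20,
 * i.e. max_comptag_mem is total RAM in MB. With 4K pages this is
 * pages >> 8; e.g. 524288 pages (2 GB) -> 2048. */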
4442 err = g->ops.ltc.init_comptags(g, gr);
4446 err = gr_gk20a_init_zcull(g, gr);
4450 err = gr_gk20a_alloc_global_ctx_buffers(g);
4454 err = gr_gk20a_init_access_map(g);
4458 gr_gk20a_load_zbc_default_table(g, gr);
4460 mutex_init(&gr->ctx_mutex);
4461 spin_lock_init(&gr->ch_tlb_lock);
4463 gr->remove_support = gk20a_remove_gr_support;
4464 gr->sw_ready = true;
4466 gk20a_dbg_fn("done");
4470 gk20a_err(dev_from_gk20a(g), "fail");
4471 gk20a_remove_gr_support(gr);
4475 static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g)
4477 struct pmu_gk20a *pmu = &g->pmu;
4478 struct mm_gk20a *mm = &g->mm;
4479 struct vm_gk20a *vm = &mm->pmu.vm;
4480 struct device *d = dev_from_gk20a(g);
4489 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
4491 gk20a_err(dev_from_gk20a(g),
4492 "fail to query fecs pg buffer size");
4496 if (!pmu->pg_buf.cpu_va) {
4497 err = gk20a_gmmu_alloc_map(vm, size, &pmu->pg_buf);
4499 gk20a_err(d, "failed to allocate memory\n");
4505 err = gr_gk20a_fecs_set_reglist_bind_inst(g,
4506 gk20a_mem_phys(&mm->pmu.inst_block));
4508 gk20a_err(dev_from_gk20a(g),
4509 "fail to bind pmu inst to gr");
4513 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, pmu->pg_buf.gpu_va);
4515 gk20a_err(dev_from_gk20a(g),
4516 "fail to set pg buffer pmu va");
4523 int gk20a_init_gr_support(struct gk20a *g)
4529 #if defined(CONFIG_GK20A_CYCLE_STATS)
4530 mutex_init(&g->gr.cs_lock);
4531 g->gr.cs_data = NULL;
4534 /* this is required before gr_gk20a_init_ctx_state */
4535 mutex_init(&g->gr.fecs_mutex);
4537 err = gr_gk20a_init_ctxsw(g);
4541 /* this appears to query for sw state, but fecs actually inits
4542 the ramchain, etc., so this is hw init */
4543 err = g->ops.gr.init_ctx_state(g);
4547 err = gk20a_init_gr_setup_sw(g);
4551 err = gk20a_init_gr_setup_hw(g);
4555 err = gk20a_init_gr_bind_fecs_elpg(g);
4559 gr_gk20a_enable_elcg(g);
4560 /* GR is initialized, signal possible waiters */
4561 g->gr.initialized = true;
4562 wake_up(&g->gr.init_wq);
4567 /* Wait until GR is initialized */
4568 void gk20a_gr_wait_initialized(struct gk20a *g)
4570 wait_event(g->gr.init_wq, g->gr.initialized);
4573 #define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
4574 #define NVA297_SET_CIRCULAR_BUFFER_SIZE 0x1280
4575 #define NVA297_SET_SHADER_EXCEPTIONS 0x1528
4576 #define NVA0C0_SET_SHADER_EXCEPTIONS 0x1528
4578 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4580 struct gr_isr_data {
4591 void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
4595 if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
4596 gk20a_writel(g,
4597 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
4598 gk20a_writel(g,
4599 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
4601 /* setup sm warp esr report masks */
4602 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4603 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4604 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4605 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4606 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4607 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4608 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4609 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4610 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4611 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4612 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4613 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4614 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4615 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4616 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4617 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4618 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4619 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4620 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4621 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4622 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4624 /* setup sm global esr report mask */
4625 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4626 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4627 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4628 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4629 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4630 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4631 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4632 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4636 static void gk20a_gr_set_circular_buffer_size(struct gk20a *g, u32 data)
4638 struct gr_gk20a *gr = &g->gr;
4639 u32 gpc_index, ppc_index, stride, val, offset;
4640 u32 cb_size = data * 4;
4644 if (cb_size > gr->attrib_cb_size)
4645 cb_size = gr->attrib_cb_size;
4647 gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4648 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4649 ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
4650 gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
4652 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4653 stride = proj_gpc_stride_v() * gpc_index;
4655 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4658 val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
4660 proj_ppc_in_gpc_stride_v() * ppc_index);
4662 offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
4664 val = set_field(val,
4665 gr_gpc0_ppc0_cbm_cfg_size_m(),
4666 gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
4667 gr->pes_tpc_count[ppc_index][gpc_index]));
4668 val = set_field(val,
4669 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4672 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4674 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4676 val = set_field(val,
4677 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4680 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4682 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4687 static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g, u32 data)
4689 struct gr_gk20a *gr = &g->gr;
4690 u32 gpc_index, ppc_index, stride, val;
4691 u32 pd_ab_max_output;
4692 u32 alpha_cb_size = data * 4;
4695 /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF)
4696 return; */
4698 if (alpha_cb_size > gr->alpha_cb_size)
4699 alpha_cb_size = gr->alpha_cb_size;
4701 gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4702 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4703 ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
4704 gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
4706 pd_ab_max_output = alpha_cb_size *
4707 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
4708 gr_pd_ab_dist_cfg1_max_output_granularity_v();
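/* The multiply/divide rescales the CB size from CBM size-granularity
 * units into PD max_output granularity units, so the pd_ab_dist_cfg1
 * write and the per-PPC cbm_cfg2 writes below describe the same alpha
 * buffer. */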
4710 gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
4711 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
4712 gr_pd_ab_dist_cfg1_max_batches_init_f());
4714 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4715 stride = proj_gpc_stride_v() * gpc_index;
4717 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4720 val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4722 proj_ppc_in_gpc_stride_v() * ppc_index);
4724 val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
4725 gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
4726 gr->pes_tpc_count[ppc_index][gpc_index]));
4728 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4730 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4735 int gk20a_enable_gr_hw(struct gk20a *g)
4741 err = gk20a_init_gr_prepare(g);
4745 err = gk20a_init_gr_reset_enable_hw(g);
4749 gk20a_dbg_fn("done");
4754 static void gr_gk20a_enable_elcg(struct gk20a *g)
4756 if (g->elcg_enabled) {
4757 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
4758 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
4760 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
4761 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
4765 int gk20a_gr_reset(struct gk20a *g)
4770 mutex_lock(&g->gr.fecs_mutex);
4772 err = gk20a_enable_gr_hw(g);
4776 err = gk20a_init_gr_setup_hw(g);
4780 err = gr_gk20a_init_ctxsw(g);
4784 mutex_unlock(&g->gr.fecs_mutex);
4786 /* this appears to query for sw state, but fecs actually inits
4787 the ramchain, etc., so this is hw init */
4788 err = g->ops.gr.init_ctx_state(g);
4793 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
4795 gk20a_err(dev_from_gk20a(g),
4796 "fail to query fecs pg buffer size");
4800 err = gr_gk20a_fecs_set_reglist_bind_inst(g,
4801 gk20a_mem_phys(&g->mm.pmu.inst_block));
4803 gk20a_err(dev_from_gk20a(g),
4804 "fail to bind pmu inst to gr");
4808 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, g->pmu.pg_buf.gpu_va);
4810 gk20a_err(dev_from_gk20a(g),
4811 "fail to set pg buffer pmu va");
4815 gr_gk20a_load_gating_prod(g);
4816 gr_gk20a_enable_elcg(g);
4821 static int gr_gk20a_handle_sw_method(struct gk20a *g, u32 addr,
4822 u32 class_num, u32 offset, u32 data)
4826 trace_gr_gk20a_handle_sw_method(g->dev->name);
4828 if (class_num == KEPLER_COMPUTE_A) {
4829 switch (offset << 2) {
4830 case NVA0C0_SET_SHADER_EXCEPTIONS:
4831 gk20a_gr_set_shader_exceptions(g, data);
4838 if (class_num == KEPLER_C) {
4839 switch (offset << 2) {
4840 case NVA297_SET_SHADER_EXCEPTIONS:
4841 gk20a_gr_set_shader_exceptions(g, data);
4843 case NVA297_SET_CIRCULAR_BUFFER_SIZE:
4844 g->ops.gr.set_circular_buffer_size(g, data);
4846 case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
4847 g->ops.gr.set_alpha_circular_buffer_size(g, data);
4859 static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
4860 struct gr_isr_data *isr_data)
4862 struct fifo_gk20a *f = &g->fifo;
4863 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4865 gk20a_set_error_notifier(ch,
4866 NVGPU_CHANNEL_GR_SEMAPHORE_TIMEOUT);
4867 gk20a_err(dev_from_gk20a(g),
4868 "gr semaphore timeout\n");
4872 static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
4873 struct gr_isr_data *isr_data)
4875 struct fifo_gk20a *f = &g->fifo;
4876 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4878 gk20a_set_error_notifier(ch,
4879 NVGPU_CHANNEL_GR_ILLEGAL_NOTIFY);
4880 /* This is an unrecoverable error, reset is needed */
4881 gk20a_err(dev_from_gk20a(g),
4882 "gr semaphore timeout\n");
4886 static int gk20a_gr_handle_illegal_method(struct gk20a *g,
4887 struct gr_isr_data *isr_data)
4889 int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
4890 isr_data->class_num, isr_data->offset,
4893 gk20a_err(dev_from_gk20a(g), "invalid method class 0x%08x"
4894 ", offset 0x%08x address 0x%08x\n",
4895 isr_data->class_num, isr_data->offset, isr_data->addr);
4900 static int gk20a_gr_handle_illegal_class(struct gk20a *g,
4901 struct gr_isr_data *isr_data)
4903 struct fifo_gk20a *f = &g->fifo;
4904 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4906 gk20a_set_error_notifier(ch,
4907 NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY);
4908 gk20a_err(dev_from_gk20a(g),
4909 "invalid class 0x%08x, offset 0x%08x",
4910 isr_data->class_num, isr_data->offset);
4914 static int gk20a_gr_handle_fecs_error(struct gk20a *g,
4915 struct gr_isr_data *isr_data)
4917 struct fifo_gk20a *f = &g->fifo;
4918 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4919 u32 gr_fecs_intr = gk20a_readl(g, gr_fecs_host_int_status_r());
4922 gk20a_err(dev_from_gk20a(g),
4923 "unhandled fecs error interrupt 0x%08x for channel %u",
4924 gr_fecs_intr, ch->hw_chid);
4926 if (gr_fecs_intr & gr_fecs_host_int_status_umimp_firmware_method_f(1)) {
4927 gk20a_err(dev_from_gk20a(g),
4928 "firmware method error 0x%08x for offset 0x%04x",
4929 gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6)),
4933 gk20a_writel(g, gr_fecs_host_int_clear_r(), gr_fecs_intr);
4937 static int gk20a_gr_handle_class_error(struct gk20a *g,
4938 struct gr_isr_data *isr_data)
4940 struct fifo_gk20a *f = &g->fifo;
4941 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4942 u32 gr_class_error =
4943 gr_class_error_code_v(gk20a_readl(g, gr_class_error_r()));
4946 gk20a_set_error_notifier(ch,
4947 NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY);
4948 gk20a_err(dev_from_gk20a(g),
4949 "class error 0x%08x, offset 0x%08x, unhandled intr 0x%08x for channel %u\n",
4950 isr_data->class_num, isr_data->offset,
4951 gr_class_error, ch->hw_chid);
4955 static int gk20a_gr_handle_firmware_method(struct gk20a *g,
4956 struct gr_isr_data *isr_data)
4958 struct fifo_gk20a *f = &g->fifo;
4959 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4963 gk20a_set_error_notifier(ch,
4964 NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY);
4965 gk20a_err(dev_from_gk20a(g),
4966 "firmware method 0x%08x, offset 0x%08x for channel %u\n",
4967 isr_data->class_num, isr_data->offset,
4972 static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
4973 struct gr_isr_data *isr_data)
4975 struct fifo_gk20a *f = &g->fifo;
4976 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4978 gk20a_channel_event(ch);
4979 wake_up(&ch->semaphore_wq);
4984 #if defined(CONFIG_GK20A_CYCLE_STATS)
4985 static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
4988 /* support only 24-bit 4-byte aligned offsets */
4989 bool valid = !(offset & 0xFF000003);
4994 /* whitelist check */
4995 valid = valid &&
4996 is_bar0_global_offset_whitelisted_gk20a(g, offset);
4997 /* resource size check in case there was a problem
4998 * with allocating the assumed size of bar0 */
4999 valid = valid &&
5000 offset < resource_size(g->reg_mem);
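/* For example, offset 0x00418800 passes the 0xFF000003 mask check
 * (fits in 24 bits, 4-byte aligned), while 0x01000000 (bit 24 set) or
 * 0x00418802 (unaligned) fails it, and the && chain then short-circuits
 * before the whitelist is even consulted. */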
5005 static int gk20a_gr_handle_notify_pending(struct gk20a *g,
5006 struct gr_isr_data *isr_data)
5008 struct fifo_gk20a *f = &g->fifo;
5009 struct channel_gk20a *ch = &f->channel[isr_data->chid];
5011 #if defined(CONFIG_GK20A_CYCLE_STATS)
5012 void *virtual_address;
5017 /* GL will never use payload 0 for cycle state */
5018 if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
5021 mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
5023 virtual_address = ch->cyclestate.cyclestate_buffer;
5024 buffer_size = ch->cyclestate.cyclestate_buffer_size;
5025 offset = isr_data->data_lo;
5028 struct share_buffer_head *sh_hdr;
5029 u32 min_element_size;
5031 /* validate offset */
5032 if (offset + sizeof(struct share_buffer_head) > buffer_size ||
5033 offset + sizeof(struct share_buffer_head) < offset) {
5034 gk20a_err(dev_from_gk20a(g),
5035 "cyclestats buffer overrun at offset 0x%x\n",
5040 sh_hdr = (struct share_buffer_head *)
5041 ((char *)virtual_address + offset);
5044 (sh_hdr->operation == OP_END ?
5045 sizeof(struct share_buffer_head) :
5046 sizeof(struct gk20a_cyclestate_buffer_elem));
5048 /* validate sh_hdr->size */
5049 if (sh_hdr->size < min_element_size ||
5050 offset + sh_hdr->size > buffer_size ||
5051 offset + sh_hdr->size < offset) {
5052 gk20a_err(dev_from_gk20a(g),
5053 "bad cyclestate buffer header size at offset 0x%x\n",
5055 sh_hdr->failed = true;
5059 switch (sh_hdr->operation) {
5067 struct gk20a_cyclestate_buffer_elem *op_elem =
5068 (struct gk20a_cyclestate_buffer_elem *)sh_hdr;
5069 bool valid = is_valid_cyclestats_bar0_offset_gk20a(
5070 g, op_elem->offset_bar0);
5076 gk20a_err(dev_from_gk20a(g),
5077 "invalid cycletstats op offset: 0x%x\n",
5078 op_elem->offset_bar0);
5080 sh_hdr->failed = exit = true;
5087 (op_elem->last_bit + 1))
5089 op_elem->first_bit)-1);
5093 op_elem->offset_bar0);
5095 switch (sh_hdr->operation) {
5098 (raw_reg & mask_orig)
5099 >> op_elem->first_bit;
5104 if ((unsigned int)mask_orig !=
5107 (raw_reg & ~mask_orig);
5110 v |= ((op_elem->data
5111 << op_elem->first_bit)
5115 op_elem->offset_bar0,
5126 /* no operation content case */
5130 sh_hdr->completed = true;
5131 offset += sh_hdr->size;
5133 mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
5136 wake_up(&ch->notifier_wq);
5140 /* Used by sw interrupt thread to translate current ctx to chid.
5141 * Also used by regops to translate current ctx to chid and tsgid.
5142 * For performance, we don't want to go through 128 channels every time.
5143 * curr_ctx should be the value read from gr_fecs_current_ctx_r().
5144 * A small tlb is used here to cache translation.
5146 * Returned channel must be freed with gk20a_channel_put() */
5147 static struct channel_gk20a *gk20a_gr_get_channel_from_ctx(
5148 struct gk20a *g, u32 curr_ctx, int *curr_tsgid)
5150 struct fifo_gk20a *f = &g->fifo;
5151 struct gr_gk20a *gr = &g->gr;
5153 int tsgid = NVGPU_INVALID_TSG_ID;
5155 struct channel_gk20a *ret = NULL;
5157 /* when contexts are unloaded from GR, the valid bit is reset
5158 * but the instance pointer information remains intact. So the
5159 * valid bit must be checked to be absolutely certain that a
5160 * valid context is currently resident. */
5161 if (!gr_fecs_current_ctx_valid_v(curr_ctx))
5164 spin_lock(&gr->ch_tlb_lock);
5166 /* check cache first */
5167 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5168 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5169 chid = gr->chid_tlb[i].hw_chid;
5170 tsgid = gr->chid_tlb[i].tsgid;
5171 ret = gk20a_channel_get(&f->channel[chid]);
5177 for (chid = 0; chid < f->num_channels; chid++) {
5178 struct channel_gk20a *ch = &f->channel[chid];
5179 if (!gk20a_channel_get(ch))
5182 if ((u32)(gk20a_mem_phys(&ch->inst_block) >>
5183 ram_in_base_shift_v()) ==
5184 gr_fecs_current_ctx_ptr_v(curr_ctx)) {
5190 gk20a_channel_put(ch);
5196 /* add to free tlb entry */
5197 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5198 if (gr->chid_tlb[i].curr_ctx == 0) {
5199 gr->chid_tlb[i].curr_ctx = curr_ctx;
5200 gr->chid_tlb[i].hw_chid = chid;
5201 gr->chid_tlb[i].tsgid = tsgid;
5206 /* no free entry, flush one */
5207 gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5208 gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
5209 gr->chid_tlb[gr->channel_tlb_flush_index].tsgid = tsgid;
5211 gr->channel_tlb_flush_index =
5212 (gr->channel_tlb_flush_index + 1) &
5213 (GR_CHANNEL_MAP_TLB_SIZE - 1);
5216 spin_unlock(&gr->ch_tlb_lock);
5217 if (curr_tsgid)
5218 *curr_tsgid = tsgid;
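/* Typical caller pattern (illustrative sketch only; gk20a_gr_isr below
 * passes NULL for curr_tsgid):
 *
 *	ch = gk20a_gr_get_channel_from_ctx(g, curr_ctx, &tsgid);
 *	if (ch) {
 *		... service the interrupt for ch->hw_chid ...
 *		gk20a_channel_put(ch);
 *	}
 *
 * Note the flush-index wraparound above assumes GR_CHANNEL_MAP_TLB_SIZE
 * is a power of two, since it wraps with an AND mask rather than a
 * modulo. */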
5222 int gk20a_gr_lock_down_sm(struct gk20a *g,
5223 u32 gpc, u32 tpc, u32 global_esr_mask)
5225 u32 offset =
5226 proj_gpc_stride_v() * gpc + proj_tpc_in_gpc_stride_v() * tpc;
5229 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5230 "GPC%d TPC%d: locking down SM", gpc, tpc);
5232 /* assert stop trigger */
5233 u32 dbgr_control0 =
5234 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
5235 dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5236 gk20a_writel(g,
5237 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
5239 return gk20a_gr_wait_for_sm_lock_down(g, gpc, tpc, global_esr_mask,
5243 bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5245 u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5247 /* check if an sm debugger is attached.
5248 * assumption: all SMs will have debug mode enabled/disabled
5250 if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5251 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
5257 static void gk20a_gr_clear_sm_hww(struct gk20a *g,
5258 u32 gpc, u32 tpc, u32 global_esr)
5260 u32 offset = proj_gpc_stride_v() * gpc +
5261 proj_tpc_in_gpc_stride_v() * tpc;
5263 gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
5266 /* clear the warp hww */
5267 gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
5268 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
5271 static struct channel_gk20a *
5272 channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
5274 return g->fifo.channel + hw_chid;
5277 static int gk20a_gr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
5281 bool do_warp_sync = false;
5282 u32 offset = proj_gpc_stride_v() * gpc +
5283 proj_tpc_in_gpc_stride_v() * tpc;
5285 /* these three interrupts don't require locking down the SM. They can
5286 * be handled by usermode clients as they aren't fatal. Additionally,
5287 * usermode clients may wish to allow some warps to execute while others
5288 * are at breakpoints, as opposed to fatal errors where all warps should
5289 * be stopped. */
5290 u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
5291 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
5292 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
5293 u32 global_esr, warp_esr;
5294 bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
5296 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
5298 global_esr = gk20a_readl(g,
5299 gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
5300 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
5302 /* if an sm debugger is attached, disable forwarding of tpc exceptions.
5303 * the debugger will reenable exceptions after servicing them. */
5304 if (sm_debugger_attached) {
5305 u32 tpc_exception_en = gk20a_readl(g,
5306 gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
5308 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5310 gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + offset,
5312 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM debugger attached");
5315 /* if a debugger is present and an error has occurred, do a warp sync */
5316 if (sm_debugger_attached &&
5317 ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5318 gk20a_dbg(gpu_dbg_intr, "warp sync needed");
5319 do_warp_sync = true;
5323 ret = gk20a_gr_lock_down_sm(g, gpc, tpc, global_mask);
5325 gk20a_err(dev_from_gk20a(g), "sm did not lock down!\n");
5330 *post_event |= true;
5335 static int gk20a_gr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
5339 u32 offset = proj_gpc_stride_v() * gpc +
5340 proj_tpc_in_gpc_stride_v() * tpc;
5341 u32 tpc_exception = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_r()
5344 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5346 /* check if an sm exception is pending */
5347 if (gr_gpc0_tpc0_tpccs_tpc_exception_sm_v(tpc_exception) ==
5348 gr_gpc0_tpc0_tpccs_tpc_exception_sm_pending_v()) {
5349 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5350 "GPC%d TPC%d: SM exception pending", gpc, tpc);
5351 ret = gk20a_gr_handle_sm_exception(g, gpc, tpc, post_event);
5357 static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event)
5360 u32 gpc_offset, tpc_offset, gpc, tpc;
5361 struct gr_gk20a *gr = &g->gr;
5362 u32 exception1 = gk20a_readl(g, gr_exception1_r());
5363 u32 gpc_exception, global_esr;
5365 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5367 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
5368 if ((exception1 & (1 << gpc)) == 0)
5371 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5372 "GPC%d exception pending", gpc);
5374 gpc_offset = proj_gpc_stride_v() * gpc;
5376 gpc_exception = gk20a_readl(g, gr_gpc0_gpccs_gpc_exception_r()
5379 /* check if any tpc has an exception */
5380 for (tpc = 0; tpc < gr->tpc_count; tpc++) {
5381 if ((gr_gpc0_gpccs_gpc_exception_tpc_v(gpc_exception) &
5385 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5386 "GPC%d: TPC%d exception pending", gpc, tpc);
5388 tpc_offset = proj_tpc_in_gpc_stride_v() * tpc;
5390 global_esr = gk20a_readl(g,
5391 gr_gpc0_tpc0_sm_hww_global_esr_r() +
5392 gpc_offset + tpc_offset);
5394 ret = gk20a_gr_handle_tpc_exception(g, gpc, tpc,
5397 /* clear the hwws, also causes tpc and gpc
5398 * exceptions to be cleared */
5399 gk20a_gr_clear_sm_hww(g, gpc, tpc, global_esr);
5406 int gk20a_gr_isr(struct gk20a *g)
5408 struct gr_isr_data isr_data;
5412 u32 gr_intr = gk20a_readl(g, gr_intr_r());
5413 struct channel_gk20a *ch = NULL;
5416 gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr);
5421 grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5422 grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5423 grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5425 gk20a_writel(g, gr_gpfifo_ctl_r(),
5426 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
5427 gr_gpfifo_ctl_semaphore_access_f(0));
5429 isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
5430 isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
5431 isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
5432 isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
5433 isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
5434 isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
5435 obj_table = (isr_data.sub_chan < 4) ? gk20a_readl(g,
5436 gr_fe_object_table_r(isr_data.sub_chan)) : 0;
5437 isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
5439 ch = gk20a_gr_get_channel_from_ctx(g, isr_data.curr_ctx, NULL);
5441 gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
5445 isr_data.chid = ch->hw_chid;
5447 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5448 "channel %d: addr 0x%08x, "
5449 "data 0x%08x 0x%08x,"
5450 "ctx 0x%08x, offset 0x%08x, "
5451 "subchannel 0x%08x, class 0x%08x",
5452 isr_data.chid, isr_data.addr,
5453 isr_data.data_hi, isr_data.data_lo,
5454 isr_data.curr_ctx, isr_data.offset,
5455 isr_data.sub_chan, isr_data.class_num);
5457 if (gr_intr & gr_intr_notify_pending_f()) {
5458 gk20a_gr_handle_notify_pending(g, &isr_data);
5459 gk20a_writel(g, gr_intr_r(),
5460 gr_intr_notify_reset_f());
5461 gr_intr &= ~gr_intr_notify_pending_f();
5464 if (gr_intr & gr_intr_semaphore_pending_f()) {
5465 gk20a_gr_handle_semaphore_pending(g, &isr_data);
5466 gk20a_writel(g, gr_intr_r(),
5467 gr_intr_semaphore_reset_f());
5468 gr_intr &= ~gr_intr_semaphore_pending_f();
5471 if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
5472 need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
5474 gk20a_writel(g, gr_intr_r(),
5475 gr_intr_semaphore_reset_f());
5476 gr_intr &= ~gr_intr_semaphore_pending_f();
5479 if (gr_intr & gr_intr_illegal_notify_pending_f()) {
5480 need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
5482 gk20a_writel(g, gr_intr_r(),
5483 gr_intr_illegal_notify_reset_f());
5484 gr_intr &= ~gr_intr_illegal_notify_pending_f();
5487 if (gr_intr & gr_intr_illegal_method_pending_f()) {
5488 need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
5489 gk20a_writel(g, gr_intr_r(),
5490 gr_intr_illegal_method_reset_f());
5491 gr_intr &= ~gr_intr_illegal_method_pending_f();
5494 if (gr_intr & gr_intr_illegal_class_pending_f()) {
5495 need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
5496 gk20a_writel(g, gr_intr_r(),
5497 gr_intr_illegal_class_reset_f());
5498 gr_intr &= ~gr_intr_illegal_class_pending_f();
5501 if (gr_intr & gr_intr_fecs_error_pending_f()) {
5502 need_reset |= gk20a_gr_handle_fecs_error(g, &isr_data);
5503 gk20a_writel(g, gr_intr_r(),
5504 gr_intr_fecs_error_reset_f());
5505 gr_intr &= ~gr_intr_fecs_error_pending_f();
5508 if (gr_intr & gr_intr_class_error_pending_f()) {
5509 need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
5510 gk20a_writel(g, gr_intr_r(),
5511 gr_intr_class_error_reset_f());
5512 gr_intr &= ~gr_intr_class_error_pending_f();
5515 /* this one happens if someone tries to hit a non-whitelisted
5516 * register using set_falcon[4] */
5517 if (gr_intr & gr_intr_firmware_method_pending_f()) {
5518 need_reset |= gk20a_gr_handle_firmware_method(g, &isr_data);
5519 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
5520 gk20a_writel(g, gr_intr_r(),
5521 gr_intr_firmware_method_reset_f());
5522 gr_intr &= ~gr_intr_firmware_method_pending_f();
5525 if (gr_intr & gr_intr_exception_pending_f()) {
5526 u32 exception = gk20a_readl(g, gr_exception_r());
5528 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
5530 if (exception & gr_exception_fe_m()) {
5531 u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5532 gk20a_dbg(gpu_dbg_intr, "fe warning %08x\n", fe);
5533 gk20a_writel(g, gr_fe_hww_esr_r(), fe);
5534 need_reset |= -EFAULT;
5537 if (exception & gr_exception_memfmt_m()) {
5538 u32 memfmt = gk20a_readl(g, gr_memfmt_hww_esr_r());
5539 gk20a_dbg(gpu_dbg_intr, "memfmt exception %08x\n",
5541 gk20a_writel(g, gr_memfmt_hww_esr_r(), memfmt);
5544 /* check if a gpc exception has occurred */
5545 if (exception & gr_exception_gpc_m() && need_reset == 0) {
5546 struct channel_gk20a *fault_ch;
5548 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC exception pending");
5550 /* if no sm debugger is present, clean up the channel */
5551 if (!gk20a_gr_sm_debugger_attached(g)) {
5552 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5553 "SM debugger not attached, clearing interrupt");
5554 need_reset |= -EFAULT;
5556 bool post_event = false;
5558 /* check if any gpc has an exception */
5559 need_reset |= gk20a_gr_handle_gpc_exception(g,
5562 /* signal clients waiting on an event */
5563 fault_ch = channel_from_hw_chid(g,
5565 if (post_event && fault_ch) {
5566 if (gk20a_is_channel_marked_as_tsg(fault_ch)) {
5567 struct tsg_gk20a *tsg = &g->fifo.tsg[fault_ch->tsgid];
5568 struct channel_gk20a *__ch;
5570 mutex_lock(&tsg->ch_list_lock);
5571 list_for_each_entry(__ch, &tsg->ch_list, ch_entry) {
5572 gk20a_dbg_gpu_post_events(__ch);
5574 mutex_unlock(&tsg->ch_list_lock);
5577 gk20a_dbg_gpu_post_events(fault_ch);
5583 gk20a_set_error_notifier(ch,
5584 NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY);
5587 if (exception & gr_exception_ds_m()) {
5588 u32 ds = gk20a_readl(g, gr_ds_hww_esr_r());
5589 gk20a_dbg(gpu_dbg_intr, "ds exception %08x\n", ds);
5590 gk20a_writel(g, gr_ds_hww_esr_r(), ds);
5593 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
5594 gr_intr &= ~gr_intr_exception_pending_f();
5598 gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A),
5599 ~(u32)0, false, false, true);
5602 if (gr_intr && !ch) {
5603 /* Clear interrupts for unused channel. This is
5604 probably an interrupt during gk20a_free_channel() */
5605 gk20a_err(dev_from_gk20a(g),
5606 "unhandled gr interrupt 0x%08x for unreferenceable channel, clearing",
5608 gk20a_writel(g, gr_intr_r(), gr_intr);
5612 gk20a_writel(g, gr_gpfifo_ctl_r(),
5613 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
5614 gr_gpfifo_ctl_semaphore_access_f(1));
5617 gk20a_err(dev_from_gk20a(g),
5618 "unhandled gr interrupt 0x%08x", gr_intr);
5621 gk20a_channel_put(ch);
5626 int gk20a_gr_nonstall_isr(struct gk20a *g)
5628 u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
5630 gk20a_dbg(gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
5632 if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
5633 /* Clear the interrupt */
5634 gk20a_writel(g, gr_intr_nonstall_r(),
5635 gr_intr_nonstall_trap_pending_f());
5636 /* Wakeup all the waiting channels */
5637 gk20a_channel_semaphore_wakeup(g);
5643 int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
5645 BUG_ON(size == NULL);
5646 return gr_gk20a_submit_fecs_method_op(g,
5647 (struct fecs_method_op_gk20a) {
5652 .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
5653 .mailbox.ret = size,
5654 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
5656 .cond.fail = GR_IS_UCODE_OP_SKIP,
5657 .mailbox.fail = 0}, false);
5660 int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
5662 return gr_gk20a_submit_fecs_method_op(g,
5663 (struct fecs_method_op_gk20a){
5665 .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
5666 gr_fecs_current_ctx_valid_f(1) |
5667 gr_fecs_current_ctx_target_vid_mem_f()),
5670 .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
5671 .mailbox.ret = NULL,
5672 .cond.ok = GR_IS_UCODE_OP_EQUAL,
5674 .cond.fail = GR_IS_UCODE_OP_SKIP,
5675 .mailbox.fail = 0}, false);
5678 int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va)
5680 return gr_gk20a_submit_fecs_method_op(g,
5681 (struct fecs_method_op_gk20a) {
5683 .mailbox.data = u64_lo32(pmu_va >> 8),
5686 .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
5687 .mailbox.ret = NULL,
5688 .cond.ok = GR_IS_UCODE_OP_EQUAL,
5690 .cond.fail = GR_IS_UCODE_OP_SKIP,
5691 .mailbox.fail = 0}, false);
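/* The three reglist helpers above share one shape: stage a value in a
 * mailbox, push a FECS method, then compare the result mailbox against
 * cond.ok/cond.fail. A new query would follow the same template
 * (sketch only; "some_new_method" is hypothetical):
 *
 *	return gr_gk20a_submit_fecs_method_op(g,
 *		(struct fecs_method_op_gk20a) {
 *			.mailbox.data = 0,
 *			.method.addr = some_new_method_v(),
 *			.mailbox.ret = &result,
 *			.cond.ok = GR_IS_UCODE_OP_EQUAL,
 *			.cond.fail = GR_IS_UCODE_OP_SKIP,
 *			.mailbox.fail = 0}, false);
 */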
5694 int gk20a_gr_suspend(struct gk20a *g)
5696 unsigned long end_jiffies = jiffies +
5697 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5702 ret = g->ops.gr.wait_empty(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
5706 gk20a_writel(g, gr_gpfifo_ctl_r(),
5707 gr_gpfifo_ctl_access_disabled_f());
5709 /* disable gr intr */
5710 gk20a_writel(g, gr_intr_r(), 0);
5711 gk20a_writel(g, gr_intr_en_r(), 0);
5713 /* disable all exceptions */
5714 gk20a_writel(g, gr_exception_r(), 0);
5715 gk20a_writel(g, gr_exception_en_r(), 0);
5716 gk20a_writel(g, gr_exception1_r(), 0);
5717 gk20a_writel(g, gr_exception1_en_r(), 0);
5718 gk20a_writel(g, gr_exception2_r(), 0);
5719 gk20a_writel(g, gr_exception2_en_r(), 0);
5721 gk20a_gr_flush_channel_tlb(&g->gr);
5723 g->gr.initialized = false;
5725 gk20a_dbg_fn("done");
5729 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5731 bool is_quad, u32 quad,
5732 u32 *context_buffer,
5733 u32 context_buffer_size,
5736 /* This function will decode a priv address and return the partition type and numbers. */
5737 static int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
5738 int *addr_type, /* enum ctxsw_addr_type */
5739 u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
5740 u32 *broadcast_flags)
5744 u32 ppc_broadcast_addr;
5746 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5748 /* setup defaults */
5750 ppc_broadcast_addr = 0;
5751 *addr_type = CTXSW_ADDR_TYPE_SYS;
5752 *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
5758 if (pri_is_gpc_addr(addr)) {
5759 *addr_type = CTXSW_ADDR_TYPE_GPC;
5760 gpc_addr = pri_gpccs_addr_mask(addr);
5761 if (pri_is_gpc_addr_shared(addr)) {
5762 *addr_type = CTXSW_ADDR_TYPE_GPC;
5763 *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
5765 *gpc_num = pri_get_gpc_num(addr);
5767 if (g->ops.gr.is_tpc_addr(gpc_addr)) {
5768 *addr_type = CTXSW_ADDR_TYPE_TPC;
5769 if (pri_is_tpc_addr_shared(gpc_addr)) {
5770 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
5773 *tpc_num = g->ops.gr.get_tpc_num(gpc_addr);
5776 } else if (pri_is_be_addr(addr)) {
5777 *addr_type = CTXSW_ADDR_TYPE_BE;
5778 if (pri_is_be_addr_shared(addr)) {
5779 *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
5782 *be_num = pri_get_be_num(addr);
5785 *addr_type = CTXSW_ADDR_TYPE_SYS;
5794 static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
5796 u32 *priv_addr_table, u32 *t)
5800 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5802 for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
5803 priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
5810 * The context buffer is indexed using BE broadcast addresses and GPC/TPC
5811 * unicast addresses. This function will convert a BE unicast address to a BE
5812 * broadcast address and split a GPC/TPC broadcast address into a table of
5813 * GPC/TPC addresses. The addresses generated by this function can be
5814 * successfully processed by gr_gk20a_find_priv_offset_in_buffer
5815 */
5816 static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
5818 u32 *priv_addr_table,
5821 int addr_type; /*enum ctxsw_addr_type */
5822 u32 gpc_num, tpc_num, ppc_num, be_num;
5823 u32 broadcast_flags;
5830 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5832 err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5833 &gpc_num, &tpc_num, &ppc_num, &be_num,
5835 gk20a_dbg(gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
5839 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5840 (addr_type == CTXSW_ADDR_TYPE_BE)) {
5841 /* The BE broadcast registers are included in the compressed PRI
5842 * table. Convert a BE unicast address to a broadcast address
5843 * so that we can look up the offset. */
5844 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
5845 !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
5846 priv_addr_table[t++] = pri_be_shared_addr(addr);
5848 priv_addr_table[t++] = addr;
5854 /* The GPC/TPC unicast registers are included in the compressed PRI
5855 * tables. Convert a GPC/TPC broadcast address to unicast addresses so
5856 * that we can look up the offsets. */
5857 if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
5858 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
5860 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5862 tpc_num < g->gr.gpc_tpc_count[gpc_num];
5864 priv_addr_table[t++] =
5865 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5868 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
5869 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5870 priv_addr_table, &t);
5874 priv_addr_table[t++] =
5875 pri_gpc_addr(pri_gpccs_addr_mask(addr),
5879 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5881 tpc_num < g->gr.gpc_tpc_count[gpc_num];
5883 priv_addr_table[t++] =
5884 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5886 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
5887 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5888 priv_addr_table, &t);
5890 priv_addr_table[t++] = addr;
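/* Illustration of the expansion (assuming 2 GPCs with 2 TPCs each): a
 * gr_gpcs_tpcs_* broadcast address becomes four unicast entries
 * (gpc0/tpc0, gpc0/tpc1, gpc1/tpc0, gpc1/tpc1), a PPC broadcast is
 * split once per pe_count_per_gpc, and a unicast BE address collapses
 * to the single shared broadcast entry handled earlier. */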
5897 int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
5900 u32 *offsets, u32 *offset_addrs,
5902 bool is_quad, u32 quad)
5905 u32 priv_offset = 0;
5906 u32 *priv_registers;
5907 u32 num_registers = 0;
5909 struct gr_gk20a *gr = &g->gr;
5910 u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
5912 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5914 /* implementation is crossed-up if either of these happen */
5915 if (max_offsets > potential_offsets)
5918 if (!g->gr.ctx_vars.golden_image_initialized)
5921 priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL);
5922 if (!priv_registers) {
5923 gk20a_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets);
5924 err = -ENOMEM;
5927 memset(offsets, 0, sizeof(u32) * max_offsets);
5928 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
5931 gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], &num_registers);
5933 if ((max_offsets > 1) && (num_registers > max_offsets)) {
5938 if ((max_offsets == 1) && (num_registers > 1))
5941 if (!g->gr.ctx_vars.local_golden_image) {
5942 gk20a_dbg_fn("no context switch header info to work with");
5947 for (i = 0; i < num_registers; i++) {
5948 err = gr_gk20a_find_priv_offset_in_buffer(g,
5951 g->gr.ctx_vars.local_golden_image,
5952 g->gr.ctx_vars.golden_image_size,
5955 gk20a_dbg_fn("Could not determine priv_offset for addr:0x%x",
5956 addr); /*, grPriRegStr(addr)));*/
5960 offsets[i] = priv_offset;
5961 offset_addrs[i] = priv_registers[i];
5964 *num_offsets = num_registers;
5968 kfree(priv_registers);
5974 /* Setup some register tables. This looks hacky; our
5975 * register/offset functions are just that, functions.
5976 * So they can't be used as initializers... TBD: fix to
5977 * generate consts at least on an as-needed basis.
5979 static const u32 _num_ovr_perf_regs = 17;
5980 static u32 _ovr_perf_regs[17] = { 0, };
5981 /* Following are the blocks of registers that the ucode
5982 stores in the extended region.*/
5983 /* == ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */
5984 static const u32 _num_sm_dsm_perf_regs = 5;
5985 /* == ctxsw_extended_sm_dsm_perf_counter_control_register_stride_v() ?*/
5986 static const u32 _num_sm_dsm_perf_ctrl_regs = 4;
5987 static u32 _sm_dsm_perf_regs[5];
5988 static u32 _sm_dsm_perf_ctrl_regs[4];
5990 static void init_ovr_perf_reg_info(void)
5992 if (_ovr_perf_regs[0] != 0)
5995 _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
5996 _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
5997 _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
5998 _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
5999 _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
6000 _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
6001 _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
6002 _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
6003 _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
6004 _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
6005 _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
6006 _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
6007 _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
6008 _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
6009 _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
6010 _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
6011 _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
6014 static void gr_gk20a_init_sm_dsm_reg_info(void)
6016 if (_sm_dsm_perf_regs[0] != 0)
6019 _sm_dsm_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r();
6020 _sm_dsm_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r();
6021 _sm_dsm_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r();
6022 _sm_dsm_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r();
6023 _sm_dsm_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r();
6025 _sm_dsm_perf_ctrl_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r();
6026 _sm_dsm_perf_ctrl_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r();
6027 _sm_dsm_perf_ctrl_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r();
6028 _sm_dsm_perf_ctrl_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r();
6032 /* TBD: would like to handle this elsewhere, at a higher level.
6033 * these are currently constructed in a "test-then-write" style
6034 * which makes it impossible to know externally whether a ctx
6035 * write will actually occur. so later we should put a lazy,
6036 * map-and-hold system in the patch write state */
6037 static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
6038 struct channel_ctx_gk20a *ch_ctx,
6042 u32 num_gpc = g->gr.gpc_count;
6050 init_ovr_perf_reg_info();
6051 g->ops.gr.init_sm_dsm_reg_info();
6053 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6055 for (reg = 0; reg < _num_ovr_perf_regs; reg++) {
6056 for (gpc = 0; gpc < num_gpc; gpc++) {
6057 num_tpc = g->gr.gpc_tpc_count[gpc];
6058 for (tpc = 0; tpc < num_tpc; tpc++) {
6059 chk_addr = ((proj_gpc_stride_v() * gpc) +
6060 (proj_tpc_in_gpc_stride_v() * tpc) +
6061 _ovr_perf_regs[reg]);
6062 if (chk_addr != addr)
6064 /* reset the patch count from previous
6065 runs, if ucode has already processed
6066 it */
6067 tmp = gk20a_mem_rd32(context +
6068 ctxsw_prog_main_image_patch_count_o(), 0);
6071 ch_ctx->patch_ctx.data_count = 0;
6073 gr_gk20a_ctx_patch_write(g, ch_ctx,
6076 vaddr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va);
6077 vaddr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va);
6079 gk20a_mem_wr32(context +
6080 ctxsw_prog_main_image_patch_count_o(),
6081 0, ch_ctx->patch_ctx.data_count);
6082 gk20a_mem_wr32(context +
6083 ctxsw_prog_main_image_patch_adr_lo_o(),
6085 gk20a_mem_wr32(context +
6086 ctxsw_prog_main_image_patch_adr_hi_o(),
6089 /* we're not caching these on cpu side,
6090 but later watch for it */
6099 static void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset)
6108 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "offset=0x%x", offset);
6110 gpc = pri_get_gpc_num(offset);
6111 gpc_tpc_addr = pri_gpccs_addr_mask(offset);
6112 tpc = g->ops.gr.get_tpc_num(gpc_tpc_addr);
6114 quad_ctrl = quad & 0x1; /* first bit tells us quad */
6115 half_ctrl = (quad >> 1) & 0x1; /* second bit tells us half */
6117 gpc_tpc_stride = gpc * proj_gpc_stride_v() +
6118 tpc * proj_tpc_in_gpc_stride_v();
6119 gpc_tpc_addr = gr_gpc0_tpc0_sm_halfctl_ctrl_r() + gpc_tpc_stride;
6121 reg = gk20a_readl(g, gpc_tpc_addr);
6122 reg = set_field(reg,
6123 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(),
6124 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_f(quad_ctrl));
6126 gk20a_writel(g, gpc_tpc_addr, reg);
6128 gpc_tpc_addr = gr_gpc0_tpc0_sm_debug_sfe_control_r() + gpc_tpc_stride;
6129 reg = gk20a_readl(g, gpc_tpc_addr);
6130 reg = set_field(reg,
6131 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(),
6132 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_f(half_ctrl));
6133 gk20a_writel(g, gpc_tpc_addr, reg);
6136 #define ILLEGAL_ID (~0)
6138 static inline bool check_main_image_header_magic(void *context)
6140 u32 magic = gk20a_mem_rd32(context +
6141 ctxsw_prog_main_image_magic_value_o(), 0);
6142 gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic);
6143 return magic == ctxsw_prog_main_image_magic_value_v_value_v();
6145 static inline bool check_local_header_magic(void *context)
6147 u32 magic = gk20a_mem_rd32(context +
6148 ctxsw_prog_local_magic_value_o(), 0);
6149 gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x", magic);
6150 return magic == ctxsw_prog_local_magic_value_v_value_v();
6154 /* most likely dupe of ctxsw_gpccs_header__size_1_v() */
6155 static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
6160 static void gr_gk20a_get_sm_dsm_perf_regs(struct gk20a *g,
6161 u32 *num_sm_dsm_perf_regs,
6162 u32 **sm_dsm_perf_regs,
6163 u32 *perf_register_stride)
6165 *num_sm_dsm_perf_regs = _num_sm_dsm_perf_regs;
6166 *sm_dsm_perf_regs = _sm_dsm_perf_regs;
6167 *perf_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v();
6170 static void gr_gk20a_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
6171 u32 *num_sm_dsm_perf_ctrl_regs,
6172 u32 **sm_dsm_perf_ctrl_regs,
6173 u32 *ctrl_register_stride)
6175 *num_sm_dsm_perf_ctrl_regs = _num_sm_dsm_perf_ctrl_regs;
6176 *sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs;
6177 *ctrl_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v();
6180 static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
6182 bool is_quad, u32 quad,
6183 u32 *context_buffer,
6184 u32 context_buffer_size,
6188 u32 gpc_num, tpc_num;
6189 u32 num_gpcs, num_tpcs;
6191 u32 ext_priv_offset, ext_priv_size;
6193 u32 offset_to_segment, offset_to_segment_end;
6194 u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
6195 u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
6196 u32 num_ext_gpccs_ext_buffer_segments;
6197 u32 inter_seg_offset;
6198 u32 tpc_gpc_mask = (proj_tpc_in_gpc_stride_v() - 1);
6200 u32 *sm_dsm_perf_ctrl_regs = NULL;
6201 u32 num_sm_dsm_perf_ctrl_regs = 0;
6202 u32 *sm_dsm_perf_regs = NULL;
6203 u32 num_sm_dsm_perf_regs = 0;
6204 u32 buffer_segments_size = 0;
6205 u32 marker_size = 0;
6206 u32 control_register_stride = 0;
6207 u32 perf_register_stride = 0;
6208 struct gr_gk20a *gr = &g->gr;
6210 /* Only have TPC registers in extended region, so if not a TPC reg,
6211 then return error so caller can look elsewhere. */
6212 if (pri_is_gpc_addr(addr)) {
6214 gpc_num = pri_get_gpc_num(addr);
6215 gpc_addr = pri_gpccs_addr_mask(addr);
6216 if (g->ops.gr.is_tpc_addr(gpc_addr))
6217 tpc_num = g->ops.gr.get_tpc_num(gpc_addr);
6221 gk20a_dbg_info(" gpc = %d tpc = %d",
6226 buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
6227 /* note below is in words/num_registers */
6228 marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
6230 context = context_buffer;
6231 /* sanity check main header */
6232 if (!check_main_image_header_magic(context)) {
6233 gk20a_err(dev_from_gk20a(g),
6234 "Invalid main header: magic value");
6237 num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
6238 if (gpc_num >= num_gpcs) {
6239 gk20a_err(dev_from_gk20a(g),
6240 "GPC 0x%08x is greater than total count 0x%08x!\n",
6245 data32 = gk20a_mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0);
6246 ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
6247 if (0 == ext_priv_size) {
6248 gk20a_dbg_info(" No extended memory in context buffer");
6251 ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
6253 offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
6254 offset_to_segment_end = offset_to_segment +
6255 (ext_priv_size * buffer_segments_size);
6257 /* check local header magic */
6258 context += ctxsw_prog_ucode_header_size_in_bytes();
6259 if (!check_local_header_magic(context)) {
6260 gk20a_err(dev_from_gk20a(g),
6261 "Invalid local header: magic value\n");
6266 * See if the incoming register address is in the first table of
6267 * registers. We check this by decoding only the TPC addr portion.
6268 * If we get a hit on the TPC bit, we then double check the address
6269 * by computing it from the base gpc/tpc strides. Then make sure
6270 * it is a real match.
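	/*
	 * Worked example (values are illustrative; the real strides come
	 * from the proj_* accessors): with a TPC-in-GPC stride of 0x800,
	 * tpc_gpc_mask == 0x7ff, so "addr & tpc_gpc_mask" strips the
	 * GPC/TPC instance bits and keeps only the per-TPC register
	 * offset. For gpc_num = 1 and tpc_num = 2 the unicast address is
	 * then rebuilt as gpc_base + 1 * gpc_stride + tpc_in_gpc_base +
	 * 2 * tpc_in_gpc_stride + (reg & tpc_gpc_mask), and must equal
	 * the incoming addr for the match to be real.
	 */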
	g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs,
				       &sm_dsm_perf_regs,
				       &perf_register_stride);

	g->ops.gr.init_sm_dsm_reg_info();

	for (i = 0; i < num_sm_dsm_perf_regs; i++) {
		if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
			sm_dsm_perf_reg_id = i;

			gk20a_dbg_info("register match: 0x%08x",
				       sm_dsm_perf_regs[i]);

			chk_addr = (proj_gpc_base_v() +
				    (proj_gpc_stride_v() * gpc_num) +
				    proj_tpc_in_gpc_base_v() +
				    (proj_tpc_in_gpc_stride_v() * tpc_num) +
				    (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask));

			if (chk_addr != addr) {
				gk20a_err(dev_from_gk20a(g),
					  "address mismatch: 0x%08x != 0x%08x\n",
					  addr, chk_addr);
				return -EINVAL;
			}
			break;
		}
	}

	/* Didn't find the reg in supported group 1,
	 * so try the second group now. */
	g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs,
					    &sm_dsm_perf_ctrl_regs,
					    &control_register_stride);

	if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
		for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
			if ((addr & tpc_gpc_mask) ==
			    (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
				sm_dsm_perf_ctrl_reg_id = i;

				gk20a_dbg_info("register match: 0x%08x",
					       sm_dsm_perf_ctrl_regs[i]);

				chk_addr = (proj_gpc_base_v() +
					    (proj_gpc_stride_v() * gpc_num) +
					    proj_tpc_in_gpc_base_v() +
					    (proj_tpc_in_gpc_stride_v() * tpc_num) +
					    (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
					     tpc_gpc_mask));

				if (chk_addr != addr) {
					gk20a_err(dev_from_gk20a(g),
						  "address mismatch: 0x%08x != 0x%08x\n",
						  addr, chk_addr);
					return -EINVAL;
				}
				break;
			}
		}
	}

	if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
	    (ILLEGAL_ID == sm_dsm_perf_reg_id))
		return -EINVAL;

	/* Skip the FECS extended header, nothing there for us now. */
	offset_to_segment += buffer_segments_size;

	/* skip through the GPCCS extended headers until we get to the data for
	 * our GPC. The size of each GPC extended segment is enough to hold the
	 * max TPC count for the GPCs, in 256B chunks.
	 */
	max_tpc_count = gr->max_tpc_per_gpc_count;

	num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);

	offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
			      buffer_segments_size * gpc_num);
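	/*
	 * Worked example (illustrative numbers): with
	 * max_tpc_per_gpc_count = 4, each GPC owns (4 + 1) / 2 = 2
	 * extended-buffer segments, so for gpc_num = 3 the skip above
	 * advances 2 * buffer_segments_size * 3 bytes past the FECS
	 * segment to reach GPC 3's data.
	 */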
	num_tpcs = g->gr.gpc_tpc_count[gpc_num];

	/* skip the head marker to start with */
	inter_seg_offset = marker_size;

	if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
		/* skip over the control regs of TPCs before the one we want,
		 * then skip to the register in this TPC */
		inter_seg_offset = inter_seg_offset +
			(tpc_num * control_register_stride) +
			sm_dsm_perf_ctrl_reg_id;
	} else {
		/* skip all the control registers */
		inter_seg_offset = inter_seg_offset +
			(num_tpcs * control_register_stride);

		/* skip the marker between control and counter segments */
		inter_seg_offset += marker_size;

		/* skip over the counter regs of TPCs before the one we want */
		inter_seg_offset = inter_seg_offset +
			(tpc_num * perf_register_stride) *
			ctxsw_prog_extended_num_smpc_quadrants_v();

		/* skip over the registers for the quadrants we do not want,
		 * then skip to the register in this TPC */
		inter_seg_offset = inter_seg_offset +
			(perf_register_stride * quad) +
			sm_dsm_perf_reg_id;
	}

	/* add the inter-segment offset to the segment offset to get the
	 * byte offset for the register */
	offset_to_segment += (inter_seg_offset * 4);
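	/*
	 * Worked example (illustrative values): for a control-register
	 * hit with tpc_num = 1, control_register_stride = 2 and
	 * sm_dsm_perf_ctrl_reg_id = 3, the walk above yields
	 * inter_seg_offset = marker_size + 1 * 2 + 3 words; the "* 4"
	 * then converts that word count into a byte offset within the
	 * GPC's extended segment.
	 */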
	/* last sanity check: did we somehow compute an offset outside the
	 * extended buffer? */
	if (offset_to_segment > offset_to_segment_end) {
		gk20a_err(dev_from_gk20a(g),
			  "Overflow ctxsw buffer! 0x%08x > 0x%08x\n",
			  offset_to_segment, offset_to_segment_end);
		return -EINVAL;
	}

	*priv_offset = offset_to_segment;

	return 0;
}
static int
gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
					     int addr_type, /* enum ctxsw_addr_type */
					     u32 pri_addr,
					     u32 gpc_num, u32 num_tpcs,
					     u32 num_ppcs, u32 ppc_mask,
					     u32 *priv_offset)
{
	u32 i;
	u32 address, base_address;
	u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
	u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
	struct aiv_gk20a *reg;

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);

	if (!g->gr.ctx_vars.valid)
		return -EINVAL;

	/* Process the SYS/BE segment. */
	if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
	    (addr_type == CTXSW_ADDR_TYPE_BE)) {
		for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
			reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
			address = reg->addr;
			sys_offset = reg->index;

			if (pri_addr == address) {
				*priv_offset = sys_offset;
				return 0;
			}
		}
	}

	/* Process the TPC segment. */
	if (addr_type == CTXSW_ADDR_TYPE_TPC) {
		for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
			for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
				reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
				address = reg->addr;
				tpc_addr = pri_tpccs_addr_mask(address);
				base_address = proj_gpc_base_v() +
					(gpc_num * proj_gpc_stride_v()) +
					proj_tpc_in_gpc_base_v() +
					(tpc_num * proj_tpc_in_gpc_stride_v());
				address = base_address + tpc_addr;
				/*
				 * The data for the TPCs is interleaved in the context buffer.
				 * Example with num_tpcs = 2
				 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
				 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
				 */
				tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);

				if (pri_addr == address) {
					*priv_offset = tpc_offset;
					return 0;
				}
			}
		}
	}

	/* Process the PPC segment. */
	if (addr_type == CTXSW_ADDR_TYPE_PPC) {
		for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
			for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
				reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
				address = reg->addr;
				ppc_addr = pri_ppccs_addr_mask(address);
				base_address = proj_gpc_base_v() +
					(gpc_num * proj_gpc_stride_v()) +
					proj_ppc_in_gpc_base_v() +
					(ppc_num * proj_ppc_in_gpc_stride_v());
				address = base_address + ppc_addr;
				/*
				 * The data for the PPCs is interleaved in the context buffer.
				 * Example with num_ppcs = 2
				 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
				 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
				 */
				ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);

				if (pri_addr == address) {
					*priv_offset = ppc_offset;
					return 0;
				}
			}
		}
	}

	/* Process the GPC segment. */
	if (addr_type == CTXSW_ADDR_TYPE_GPC) {
		for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
			reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];

			address = reg->addr;
			gpc_addr = pri_gpccs_addr_mask(address);
			gpc_offset = reg->index;

			base_address = proj_gpc_base_v() +
				(gpc_num * proj_gpc_stride_v());
			address = base_address + gpc_addr;

			if (pri_addr == address) {
				*priv_offset = gpc_offset;
				return 0;
			}
		}
	}

	return -EINVAL;
}
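/*
 * Sketch of the interleaving rule used by the TPC and PPC cases above
 * (a standalone illustration, not a driver helper): register words for
 * unit N are spaced num_units apart, so register "reg_index" of unit
 * "unit_num" lands at this byte offset within the segment.
 */
static inline u32 __maybe_unused
example_interleaved_unit_offset(u32 reg_index, u32 unit_num, u32 num_units)
{
	/* matches tpc_offset/ppc_offset above: (index * count) + unit * 4 */
	return (reg_index * num_units) + (unit_num * 4);
}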
static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
						void *context,
						u32 *num_ppcs, u32 *ppc_mask,
						u32 *reg_ppc_count)
{
	u32 data32;
	u32 litter_num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();

	/*
	 * if there is only 1 PES_PER_GPC, then we put the PES registers
	 * in the GPC reglist, so we can't error out if ppc.count == 0
	 */
	if ((!g->gr.ctx_vars.valid) ||
	    ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
	     (litter_num_pes_per_gpc > 1)))
		return -EINVAL;

	data32 = gk20a_mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0);

	*num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
	*ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);

	*reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;

	return 0;
}

/*
 * This function will return the 32 bit offset for a priv register if it is
 * present in the context buffer.
 */
static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
					       u32 addr,
					       bool is_quad, u32 quad,
					       u32 *context_buffer,
					       u32 context_buffer_size,
					       u32 *priv_offset)
{
	struct gr_gk20a *gr = &g->gr;
	u32 i, data32;
	int err;
	int addr_type; /* enum ctxsw_addr_type */
	u32 broadcast_flags;
	u32 gpc_num, tpc_num, ppc_num, be_num;
	u32 num_gpcs, num_tpcs, num_ppcs;
	u32 offset;
	u32 sys_priv_offset, gpc_priv_offset;
	u32 ppc_mask, reg_list_ppc_count;
	void *context;
	u32 offset_to_segment;

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);

	err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
					&gpc_num, &tpc_num, &ppc_num, &be_num,
					&broadcast_flags);
	if (err)
		return err;

	context = context_buffer;
	if (!check_main_image_header_magic(context)) {
		gk20a_err(dev_from_gk20a(g),
			  "Invalid main header: magic value");
		return -EINVAL;
	}
	num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);

	/* Parse the FECS local header. */
	context += ctxsw_prog_ucode_header_size_in_bytes();
	if (!check_local_header_magic(context)) {
		gk20a_err(dev_from_gk20a(g),
			  "Invalid FECS local header: magic value\n");
		return -EINVAL;
	}
	data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
	sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);

	/* If found in the extended buffer, ok.
	 * If it failed and we expected to find it there (quad offset),
	 * then return the error. Otherwise continue on.
	 */
	err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
			addr, is_quad, quad, context_buffer,
			context_buffer_size, priv_offset);
	if (!err || (err && is_quad))
		return err;

	if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
	    (addr_type == CTXSW_ADDR_TYPE_BE)) {
		/* Find the offset in the FECS segment. */
		offset_to_segment = sys_priv_offset *
			ctxsw_prog_ucode_header_size_in_bytes();

		err = gr_gk20a_process_context_buffer_priv_segment(g,
					addr_type, addr,
					0, 0, 0, 0,
					&offset);
		if (err)
			return err;

		*priv_offset = (offset_to_segment + offset);
		return 0;
	}

	if ((gpc_num + 1) > num_gpcs) {
		gk20a_err(dev_from_gk20a(g),
			  "GPC %d not in this context buffer.\n",
			  gpc_num);
		return -EINVAL;
	}

	/* Parse the GPCCS local header(s). */
	for (i = 0; i < num_gpcs; i++) {
		context += ctxsw_prog_ucode_header_size_in_bytes();
		if (!check_local_header_magic(context)) {
			gk20a_err(dev_from_gk20a(g),
				  "Invalid GPCCS local header: magic value\n");
			return -EINVAL;
		}
		data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
		gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);

		err = gr_gk20a_determine_ppc_configuration(g, context,
							   &num_ppcs, &ppc_mask,
							   &reg_list_ppc_count);
		if (err)
			return err;

		num_tpcs = gk20a_mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0);

		if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
			gk20a_err(dev_from_gk20a(g),
				  "GPC %d TPC %d not in this context buffer.\n",
				  gpc_num, tpc_num);
			return -EINVAL;
		}

		/* Find the offset in the GPCCS segment. */
		if (i == gpc_num) {
			offset_to_segment = gpc_priv_offset *
				ctxsw_prog_ucode_header_size_in_bytes();

			if (addr_type == CTXSW_ADDR_TYPE_TPC) {
				/*reg = gr->ctx_vars.ctxsw_regs.tpc.l;*/
			} else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
				/* The ucode stores TPC data before PPC data.
				 * Advance the offset past the TPC data to the PPC data. */
				offset_to_segment +=
					((gr->ctx_vars.ctxsw_regs.tpc.count *
					  num_tpcs) << 2);
			} else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
				/* The ucode stores TPC/PPC data before GPC data.
				 * Advance the offset past the TPC/PPC data to the GPC data. */
				/* note the 1 PES_PER_GPC case */
				u32 litter_num_pes_per_gpc =
					proj_scal_litter_num_pes_per_gpc_v();
				if (litter_num_pes_per_gpc > 1) {
					offset_to_segment +=
						(((gr->ctx_vars.ctxsw_regs.tpc.count *
						   num_tpcs) << 2) +
						 ((reg_list_ppc_count * num_ppcs) << 2));
				} else {
					offset_to_segment +=
						((gr->ctx_vars.ctxsw_regs.tpc.count *
						  num_tpcs) << 2);
				}
			} else {
				gk20a_err(dev_from_gk20a(g),
					  " Unknown address type.\n");
				return -EINVAL;
			}

			err = gr_gk20a_process_context_buffer_priv_segment(g,
						addr_type, addr,
						i, num_tpcs,
						num_ppcs, ppc_mask,
						&offset);
			if (err)
				return -EINVAL;

			*priv_offset = offset_to_segment + offset;
			return 0;
		}
	}

	return -EINVAL;
}
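/*
 * Worked example of the GPCCS segment layout handled above
 * (illustrative counts): with ctxsw_regs.tpc.count = 10, num_tpcs = 2,
 * reg_list_ppc_count = 4 and num_ppcs = 1, a GPC-type register lives
 * past (10 * 2) << 2 = 80 bytes of TPC data plus (4 * 1) << 2 = 16
 * bytes of PPC data from the start of that GPC's segment; the final
 * offset then comes from the per-segment table lookup.
 */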
bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch)
{
	int curr_gr_ctx, curr_gr_tsgid;
	struct gk20a *g = ch->g;
	struct channel_gk20a *curr_ch;
	bool ret = false;

	curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
	curr_ch = gk20a_gr_get_channel_from_ctx(g, curr_gr_ctx,
						&curr_gr_tsgid);

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg,
		  "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d"
		  " ch->hw_chid=%d",
		  curr_ch ? curr_ch->hw_chid : -1,
		  curr_gr_tsgid,
		  ch->tsgid,
		  ch->hw_chid);

	if (!curr_ch)
		return false;

	if (ch->hw_chid == curr_ch->hw_chid)
		ret = true;

	if (gk20a_is_channel_marked_as_tsg(ch) && (ch->tsgid == curr_gr_tsgid))
		ret = true;

	gk20a_channel_put(curr_ch);
	return ret;
}
int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
			  struct nvgpu_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
			  u32 num_ctx_wr_ops, u32 num_ctx_rd_ops)
{
	struct gk20a *g = ch->g;
	struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
	void *ctx_ptr = NULL;
	bool ch_is_curr_ctx, restart_gr_ctxsw = false;
	u32 i, j, offset, v;
	struct gr_gk20a *gr = &g->gr;
	u32 max_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
	u32 *offsets = NULL;
	u32 *offset_addrs = NULL;
	u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
	u32 num_offsets;
	int err = 0, pass;

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
		  num_ctx_wr_ops, num_ctx_rd_ops);

	/* disable channel switching.
	 * at that point the hardware state can be inspected to
	 * determine if the context we're interested in is current.
	 */
	err = gr_gk20a_disable_ctxsw(g);
	if (err) {
		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw");
		/* this should probably be ctx-fatal... */
		goto cleanup;
	}

	restart_gr_ctxsw = true;

	ch_is_curr_ctx = gk20a_is_channel_ctx_resident(ch);

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d", ch_is_curr_ctx);

	if (ch_is_curr_ctx) {
		for (pass = 0; pass < 2; pass++) {
			ctx_op_nr = 0;
			for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
				/* only do ctx ops and only on the right pass */
				if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
				    (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
				     ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
					continue;

				/* if this is a quad access, setup for special access */
				if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)
				    && g->ops.gr.access_smpc_reg)
					g->ops.gr.access_smpc_reg(g,
							ctx_ops[i].quad,
							ctx_ops[i].offset);
				offset = ctx_ops[i].offset;

				if (pass == 0) { /* write pass */
					v = gk20a_readl(g, offset);
					v &= ~ctx_ops[i].and_n_mask_lo;
					v |= ctx_ops[i].value_lo;
					gk20a_writel(g, offset, v);

					gk20a_dbg(gpu_dbg_gpu_dbg,
						  "direct wr: offset=0x%x v=0x%x",
						  offset, v);

					if (ctx_ops[i].op == REGOP(WRITE_64)) {
						v = gk20a_readl(g, offset + 4);
						v &= ~ctx_ops[i].and_n_mask_hi;
						v |= ctx_ops[i].value_hi;
						gk20a_writel(g, offset + 4, v);

						gk20a_dbg(gpu_dbg_gpu_dbg,
							  "direct wr: offset=0x%x v=0x%x",
							  offset + 4, v);
					}
				} else { /* read pass */
					ctx_ops[i].value_lo =
						gk20a_readl(g, offset);

					gk20a_dbg(gpu_dbg_gpu_dbg,
						  "direct rd: offset=0x%x v=0x%x",
						  offset, ctx_ops[i].value_lo);

					if (ctx_ops[i].op == REGOP(READ_64)) {
						ctx_ops[i].value_hi =
							gk20a_readl(g, offset + 4);

						gk20a_dbg(gpu_dbg_gpu_dbg,
							  "direct rd: offset=0x%x v=0x%x",
							  offset + 4, ctx_ops[i].value_hi);
					} else
						ctx_ops[i].value_hi = 0;
				}
				ctx_op_nr++;
			}
		}
		goto cleanup;
	}

	/* they're the same size, so just use one alloc for both */
	offsets = kzalloc(2 * sizeof(u32) * max_offsets, GFP_KERNEL);
	if (!offsets) {
		err = -ENOMEM;
		goto cleanup;
	}
	offset_addrs = offsets + max_offsets;

	/* would have been a variant of gr_gk20a_apply_instmem_overrides, */
	/* recoded in-place instead. */
	ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
		       PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
		       0, pgprot_dmacoherent(PAGE_KERNEL));
	if (!ctx_ptr) {
		err = -ENOMEM;
		goto cleanup;
	}

	g->ops.mm.l2_flush(g, true);

	/* write to the appropriate place in the context image;
	 * first we have to figure out where that really is */

	/* first pass is writes, second is reads */
	for (pass = 0; pass < 2; pass++) {
		ctx_op_nr = 0;
		for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
			/* only do ctx ops and only on the right pass */
			if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
			    (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
			     ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
				continue;

			err = gr_gk20a_get_ctx_buffer_offsets(g,
						ctx_ops[i].offset,
						max_offsets,
						offsets, offset_addrs,
						&num_offsets,
						ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
						ctx_ops[i].quad);
			if (err) {
				gk20a_dbg(gpu_dbg_gpu_dbg,
					  "ctx op invalid offset: offset=0x%x",
					  ctx_ops[i].offset);
				ctx_ops[i].status =
					NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET;
				continue;
			}

			/* if this is a quad access, setup for special access */
			if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD) &&
			    g->ops.gr.access_smpc_reg)
				g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad,
							  ctx_ops[i].offset);

			for (j = 0; j < num_offsets; j++) {
				/* sanity check, don't write outside, worst case */
				if (offsets[j] >= g->gr.ctx_vars.golden_image_size)
					continue;
				if (pass == 0) { /* write pass */
					v = gk20a_mem_rd32(ctx_ptr + offsets[j], 0);
					v &= ~ctx_ops[i].and_n_mask_lo;
					v |= ctx_ops[i].value_lo;
					gk20a_mem_wr32(ctx_ptr + offsets[j], 0, v);

					gk20a_dbg(gpu_dbg_gpu_dbg,
						  "context wr: offset=0x%x v=0x%x",
						  offsets[j], v);

					if (ctx_ops[i].op == REGOP(WRITE_64)) {
						v = gk20a_mem_rd32(ctx_ptr + offsets[j] + 4, 0);
						v &= ~ctx_ops[i].and_n_mask_hi;
						v |= ctx_ops[i].value_hi;
						gk20a_mem_wr32(ctx_ptr + offsets[j] + 4, 0, v);

						gk20a_dbg(gpu_dbg_gpu_dbg,
							  "context wr: offset=0x%x v=0x%x",
							  offsets[j] + 4, v);
					}

					/* check to see if we need to add a special WAR
					   for some of the SMPC perf regs */
					gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j],
								v);
				} else { /* read pass */
					ctx_ops[i].value_lo =
						gk20a_mem_rd32(ctx_ptr + offsets[0], 0);

					gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
						  offsets[0], ctx_ops[i].value_lo);

					if (ctx_ops[i].op == REGOP(READ_64)) {
						ctx_ops[i].value_hi =
							gk20a_mem_rd32(ctx_ptr + offsets[0] + 4, 0);

						gk20a_dbg(gpu_dbg_gpu_dbg,
							  "context rd: offset=0x%x v=0x%x",
							  offsets[0] + 4, ctx_ops[i].value_hi);
					} else
						ctx_ops[i].value_hi = 0;
				}
			}
			ctx_op_nr++;
		}
	}

	/* flush cpu caches for the ctx buffer? only if cpu cached, of course.
	 * they aren't, yet */
	FLUSH_CPU_DCACHE(ctx_ptr,
			 sg_phys(ch_ctx->gr_ctx->mem.ref), size);

 cleanup:
	if (offsets)
		kfree(offsets);

	if (ctx_ptr)
		vunmap(ctx_ptr);

	if (restart_gr_ctxsw) {
		int tmp_err = gr_gk20a_enable_ctxsw(g);
		if (tmp_err) {
			gk20a_err(dev_from_gk20a(g), "unable to restart ctxsw!\n");
			err = tmp_err;
		}
	}

	return err;
}
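/*
 * Usage sketch for gr_gk20a_exec_ctx_ops() (illustrative only; the
 * function name is hypothetical, "ch" is assumed to be a valid bound
 * channel, and REGOP() is assumed to expand to the
 * NVGPU_DBG_GPU_REG_OP_* values as in the rest of this file): one
 * 32-bit context write followed by one 32-bit context read of the same
 * offset, batched as a single write pass plus read pass.
 */
static int __maybe_unused example_exec_ctx_rw(struct channel_gk20a *ch,
					      u32 reg_offset, u32 wr_value)
{
	struct nvgpu_dbg_gpu_reg_op ops[2];

	memset(ops, 0, sizeof(ops));

	ops[0].op = REGOP(WRITE_32);
	ops[0].type = REGOP(TYPE_GR_CTX);
	ops[0].offset = reg_offset;
	ops[0].value_lo = wr_value;
	ops[0].and_n_mask_lo = ~0U;	/* clear every bit, then OR in value_lo */

	ops[1].op = REGOP(READ_32);
	ops[1].type = REGOP(TYPE_GR_CTX);
	ops[1].offset = reg_offset;

	/* 2 ops total: 1 context write, then 1 context read */
	return gr_gk20a_exec_ctx_ops(ch, ops, 2, 1, 1);
}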
static void gr_gk20a_cb_size_default(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;

	if (!gr->attrib_cb_default_size)
		gr->attrib_cb_default_size =
			gr_gpc0_ppc0_cbm_cfg_size_default_v();
	gr->alpha_cb_default_size =
		gr_gpc0_ppc0_cbm_cfg2_size_default_v();
}

static int gr_gk20a_calc_global_ctx_buffer_size(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;
	int size;

	gr->attrib_cb_size = gr->attrib_cb_default_size;
	gr->alpha_cb_size = gr->alpha_cb_default_size
		+ (gr->alpha_cb_default_size >> 1);

	size = gr->attrib_cb_size *
		gr_gpc0_ppc0_cbm_cfg_size_granularity_v() *
		gr->max_tpc_count;

	size += gr->alpha_cb_size *
		gr_gpc0_ppc0_cbm_cfg2_size_granularity_v() *
		gr->max_tpc_count;

	return size;
}

void gr_gk20a_commit_global_pagepool(struct gk20a *g,
				     struct channel_ctx_gk20a *ch_ctx,
				     u64 addr, u32 size, bool patch)
{
	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
		gr_scc_pagepool_base_addr_39_8_f(addr), patch);

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
		gr_scc_pagepool_total_pages_f(size) |
		gr_scc_pagepool_valid_true_f(), patch);

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
		gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
		gr_gpcs_gcc_pagepool_total_pages_f(size), patch);

	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
		gr_pd_pagepool_total_pages_f(size) |
		gr_pd_pagepool_valid_true_f(), patch);
}

void gk20a_init_gr(struct gk20a *g)
{
	init_waitqueue_head(&g->gr.init_wq);
}

static bool gr_gk20a_is_tpc_addr(u32 addr)
{
	return ((addr >= proj_tpc_in_gpc_base_v()) &&
		(addr < proj_tpc_in_gpc_base_v() +
		 (proj_scal_litter_num_tpc_per_gpc_v() *
		  proj_tpc_in_gpc_stride_v())))
		|| pri_is_tpc_addr_shared(addr);
}

static u32 gr_gk20a_get_tpc_num(u32 addr)
{
	u32 i, start;
	u32 num_tpcs = proj_scal_litter_num_tpc_per_gpc_v();

	for (i = 0; i < num_tpcs; i++) {
		start = proj_tpc_in_gpc_base_v() +
			(i * proj_tpc_in_gpc_stride_v());
		if ((addr >= start) &&
		    (addr < (start + proj_tpc_in_gpc_stride_v())))
			return i;
	}
	return 0;
}
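/*
 * Worked example (illustrative base/stride; the real values come from
 * the proj_* accessors): assuming tpc_in_gpc_base = 0x4000 and
 * tpc_in_gpc_stride = 0x800, a GPC-relative address of 0x4a04 falls in
 * TPC 1's window [0x4800, 0x5000), so gr_gk20a_get_tpc_num() returns 1
 * and gr_gk20a_is_tpc_addr() reports true for it.
 */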
static int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc,
					  u32 global_esr_mask, bool check_errors)
{
	unsigned long end_jiffies = jiffies +
		msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
	u32 delay = GR_IDLE_CHECK_DEFAULT;
	bool mmu_debug_mode_enabled = g->ops.mm.is_debug_mode_enabled(g);
	u32 offset =
		proj_gpc_stride_v() * gpc + proj_tpc_in_gpc_stride_v() * tpc;

	gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
		  "GPC%d TPC%d: locking down SM", gpc, tpc);

	/* wait for the sm to lock down */
	do {
		u32 global_esr = gk20a_readl(g,
				gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
		u32 warp_esr = gk20a_readl(g,
				gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
		u32 dbgr_status0 = gk20a_readl(g,
				gr_gpc0_tpc0_sm_dbgr_status0_r() + offset);
		bool locked_down =
			(gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
			 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
		bool no_error_pending =
			check_errors &&
			(gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) ==
			 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) &&
			((global_esr & ~global_esr_mask) == 0);

		if (locked_down || no_error_pending) {
			gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
				  "GPC%d TPC%d: locked down SM", gpc, tpc);
			return 0;
		}

		/* if an mmu fault is pending and mmu debug mode is not
		 * enabled, the sm will never lock down. */
		if (!mmu_debug_mode_enabled &&
		    gk20a_fifo_mmu_fault_pending(g)) {
			gk20a_err(dev_from_gk20a(g),
				  "GPC%d TPC%d: mmu fault pending,"
				  " sm will never lock down!", gpc, tpc);
			return -EFAULT;
		}

		usleep_range(delay, delay * 2);
		delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);

	} while (time_before(jiffies, end_jiffies)
			|| !tegra_platform_is_silicon());

	gk20a_err(dev_from_gk20a(g),
		  "GPC%d TPC%d: timed out while trying to lock down SM",
		  gpc, tpc);

	return -EAGAIN;
}
void gk20a_suspend_all_sms(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;
	u32 gpc, tpc;
	u32 dbgr_control0;
	int err;

	/* if an SM debugger isn't attached, skip suspend */
	if (!gk20a_gr_sm_debugger_attached(g)) {
		gk20a_err(dev_from_gk20a(g), "SM debugger not attached, "
			  "skipping suspend!\n");
		return;
	}

	/* assert stop trigger. uniformity assumption: all SMs will have
	 * the same state in dbg_control0. */
	dbgr_control0 =
		gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
	dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();

	/* broadcast write */
	gk20a_writel(g,
		gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);

	for (gpc = 0; gpc < gr->gpc_count; gpc++) {
		for (tpc = 0; tpc < gr->tpc_count; tpc++) {
			err =
			 gk20a_gr_wait_for_sm_lock_down(g, gpc, tpc, 0, false);
			if (err) {
				gk20a_err(dev_from_gk20a(g),
					  "SuspendAllSms failed\n");
				return;
			}
		}
	}
}

void gk20a_resume_all_sms(struct gk20a *g)
{
	u32 dbgr_control0;
	/*
	 * The following requires some clarification. Despite the fact that both
	 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
	 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
	 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
	 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
	 * (_DISABLE) as well.
	 *
	 * Advice from the arch group: Disable the stop trigger first, as a
	 * separate operation, in order to ensure that the trigger has taken
	 * effect, before enabling the run trigger.
	 */

	/* De-assert stop trigger */
	dbgr_control0 =
		gk20a_readl(g, gr_gpcs_tpcs_sm_dbgr_control0_r());
	dbgr_control0 &= ~gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
	gk20a_writel(g,
		gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);

	/* Enable run trigger */
	dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
	gk20a_writel(g,
		gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
}
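/*
 * Illustrative pairing of the two routines above (a sketch with a
 * hypothetical function name, assuming an SM debugger is attached as
 * gk20a_suspend_all_sms() requires): halt every SM, inspect state
 * through the priv registers, then resume via the two-step
 * stop-trigger/run-trigger sequence.
 */
static void __maybe_unused example_sm_stop_inspect_resume(struct gk20a *g)
{
	gk20a_suspend_all_sms(g);
	/* ... read SM/warp state via gr_gpc*_tpc*_sm_* registers here ... */
	gk20a_resume_all_sms(g);
}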
static u32 gr_gk20a_pagepool_default_size(struct gk20a *g)
{
	return gr_scc_pagepool_total_pages_hwmax_value_v();
}

static u32 gr_gk20a_get_max_fbps_count(struct gk20a *g)
{
	u32 max_fbps_count, tmp;
	tmp = gk20a_readl(g, top_num_fbps_r());
	max_fbps_count = top_num_fbps_value_v(tmp);
	return max_fbps_count;
}

static u32 gr_gk20a_get_fbp_en_mask(struct gk20a *g)
{
	u32 fbp_en_mask, opt_fbio;
	opt_fbio = gk20a_readl(g, top_fs_status_fbp_r());
	fbp_en_mask = top_fs_status_fbp_cluster_v(opt_fbio);
	return fbp_en_mask;
}

static u32 gr_gk20a_get_max_ltc_per_fbp(struct gk20a *g)
{
	return 1;
}

static u32 gr_gk20a_get_max_lts_per_ltc(struct gk20a *g)
{
	return 1;
}

static u32 *gr_gk20a_rop_l2_en_mask(struct gk20a *g)
{
	/* gk20a doesn't have a rop_l2_en_mask */
	return NULL;
}
static int gr_gk20a_dump_gr_status_regs(struct gk20a *g,
					struct gk20a_debug_output *o)
{
	gk20a_debug_output(o, "NV_PGRAPH_STATUS: 0x%x\n",
		gk20a_readl(g, gr_status_r()));
	gk20a_debug_output(o, "NV_PGRAPH_STATUS1: 0x%x\n",
		gk20a_readl(g, gr_status_1_r()));
	gk20a_debug_output(o, "NV_PGRAPH_STATUS2: 0x%x\n",
		gk20a_readl(g, gr_status_2_r()));
	gk20a_debug_output(o, "NV_PGRAPH_ENGINE_STATUS: 0x%x\n",
		gk20a_readl(g, gr_engine_status_r()));
	gk20a_debug_output(o, "NV_PGRAPH_GRFIFO_STATUS : 0x%x\n",
		gk20a_readl(g, gr_gpfifo_status_r()));
	gk20a_debug_output(o, "NV_PGRAPH_GRFIFO_CONTROL : 0x%x\n",
		gk20a_readl(g, gr_gpfifo_ctl_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_FECS_HOST_INT_STATUS : 0x%x\n",
		gk20a_readl(g, gr_fecs_host_int_status_r()));
	gk20a_debug_output(o, "NV_PGRAPH_EXCEPTION : 0x%x\n",
		gk20a_readl(g, gr_exception_r()));
	gk20a_debug_output(o, "NV_PGRAPH_FECS_INTR : 0x%x\n",
		gk20a_readl(g, gr_fecs_intr_r()));
	gk20a_debug_output(o, "NV_PFIFO_ENGINE_STATUS(GR) : 0x%x\n",
		gk20a_readl(g, fifo_engine_status_r(ENGINE_GR_GK20A)));
	gk20a_debug_output(o, "NV_PGRAPH_ACTIVITY0: 0x%x\n",
		gk20a_readl(g, gr_activity_0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_ACTIVITY1: 0x%x\n",
		gk20a_readl(g, gr_activity_1_r()));
	gk20a_debug_output(o, "NV_PGRAPH_ACTIVITY2: 0x%x\n",
		gk20a_readl(g, gr_activity_2_r()));
	gk20a_debug_output(o, "NV_PGRAPH_ACTIVITY4: 0x%x\n",
		gk20a_readl(g, gr_activity_4_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_SKED_ACTIVITY: 0x%x\n",
		gk20a_readl(g, gr_pri_sked_activity_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_GPCCS_GPC_ACTIVITY0: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_gpccs_gpc_activity0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_GPCCS_GPC_ACTIVITY1: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_gpccs_gpc_activity1_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_GPCCS_GPC_ACTIVITY2: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_gpccs_gpc_activity2_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_GPCCS_GPC_ACTIVITY3: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_gpccs_gpc_activity3_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_TPC0_TPCCS_TPC_ACTIVITY0: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_tpc0_tpccs_tpc_activity_0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_TPCS_TPCCS_TPC_ACTIVITY0: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_tpcs_tpccs_tpc_activity_0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPCS_GPCCS_GPC_ACTIVITY0: 0x%x\n",
		gk20a_readl(g, gr_pri_gpcs_gpccs_gpc_activity_0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPCS_GPCCS_GPC_ACTIVITY1: 0x%x\n",
		gk20a_readl(g, gr_pri_gpcs_gpccs_gpc_activity_1_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPCS_GPCCS_GPC_ACTIVITY2: 0x%x\n",
		gk20a_readl(g, gr_pri_gpcs_gpccs_gpc_activity_2_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPCS_GPCCS_GPC_ACTIVITY3: 0x%x\n",
		gk20a_readl(g, gr_pri_gpcs_gpccs_gpc_activity_3_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPCS_TPC0_TPCCS_TPC_ACTIVITY0: 0x%x\n",
		gk20a_readl(g, gr_pri_gpcs_tpc0_tpccs_tpc_activity_0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPCS_TPCS_TPCCS_TPC_ACTIVITY0: 0x%x\n",
		gk20a_readl(g, gr_pri_gpcs_tpcs_tpccs_tpc_activity_0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_BE0_BECS_BE_ACTIVITY0: 0x%x\n",
		gk20a_readl(g, gr_pri_be0_becs_be_activity0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_BES_BECS_BE_ACTIVITY0: 0x%x\n",
		gk20a_readl(g, gr_pri_bes_becs_be_activity0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_DS_MPIPE_STATUS: 0x%x\n",
		gk20a_readl(g, gr_pri_ds_mpipe_status_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_FE_GO_IDLE_ON_STATUS: 0x%x\n",
		gk20a_readl(g, gr_pri_fe_go_idle_on_status_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_FE_GO_IDLE_TIMEOUT : 0x%x\n",
		gk20a_readl(g, gr_fe_go_idle_timeout_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_FE_GO_IDLE_CHECK : 0x%x\n",
		gk20a_readl(g, gr_pri_fe_go_idle_check_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_FE_GO_IDLE_INFO : 0x%x\n",
		gk20a_readl(g, gr_pri_fe_go_idle_info_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_TPC0_TEX_M_TEX_SUBUNITS_STATUS: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_tpc0_tex_m_tex_subunits_status_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_FECS_CTXSW_STATUS_FE_0: 0x%x\n",
		gk20a_readl(g, gr_fecs_ctxsw_status_fe_0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_FECS_CTXSW_STATUS_1: 0x%x\n",
		gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_GPCCS_CTXSW_STATUS_GPC_0: 0x%x\n",
		gk20a_readl(g, gr_gpc0_gpccs_ctxsw_status_gpc_0_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_GPCCS_CTXSW_STATUS_1: 0x%x\n",
		gk20a_readl(g, gr_gpc0_gpccs_ctxsw_status_1_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_FECS_CTXSW_IDLESTATE : 0x%x\n",
		gk20a_readl(g, gr_fecs_ctxsw_idlestate_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_GPCCS_CTXSW_IDLESTATE : 0x%x\n",
		gk20a_readl(g, gr_gpc0_gpccs_ctxsw_idlestate_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_FECS_CURRENT_CTX : 0x%x\n",
		gk20a_readl(g, gr_fecs_current_ctx_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_FECS_NEW_CTX : 0x%x\n",
		gk20a_readl(g, gr_fecs_new_ctx_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_BE0_CROP_STATUS1 : 0x%x\n",
		gk20a_readl(g, gr_pri_be0_crop_status1_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_BES_CROP_STATUS1 : 0x%x\n",
		gk20a_readl(g, gr_pri_bes_crop_status1_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_BE0_ZROP_STATUS : 0x%x\n",
		gk20a_readl(g, gr_pri_be0_zrop_status_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_BE0_ZROP_STATUS2 : 0x%x\n",
		gk20a_readl(g, gr_pri_be0_zrop_status2_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_BES_ZROP_STATUS : 0x%x\n",
		gk20a_readl(g, gr_pri_bes_zrop_status_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_BES_ZROP_STATUS2 : 0x%x\n",
		gk20a_readl(g, gr_pri_bes_zrop_status2_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_BE0_BECS_BE_EXCEPTION: 0x%x\n",
		gk20a_readl(g, gr_pri_be0_becs_be_exception_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_BE0_BECS_BE_EXCEPTION_EN: 0x%x\n",
		gk20a_readl(g, gr_pri_be0_becs_be_exception_en_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_GPCCS_GPC_EXCEPTION: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_gpccs_gpc_exception_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_GPCCS_GPC_EXCEPTION_EN: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_gpccs_gpc_exception_en_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_TPC0_TPCCS_TPC_EXCEPTION: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_tpc0_tpccs_tpc_exception_r()));
	gk20a_debug_output(o, "NV_PGRAPH_PRI_GPC0_TPC0_TPCCS_TPC_EXCEPTION_EN: 0x%x\n",
		gk20a_readl(g, gr_pri_gpc0_tpc0_tpccs_tpc_exception_en_r()));
	return 0;
}
int gr_gk20a_debugfs_init(struct gk20a *g)
{
	struct gk20a_platform *platform = platform_get_drvdata(g->dev);

	g->debugfs_gr_default_attrib_cb_size =
		debugfs_create_u32("gr_default_attrib_cb_size",
				   S_IRUGO|S_IWUSR, platform->debugfs,
				   &g->gr.attrib_cb_default_size);

	return 0;
}

static void gr_gk20a_init_cyclestats(struct gk20a *g)
{
#if defined(CONFIG_GK20A_CYCLE_STATS)
	g->gpu_characteristics.flags |=
		NVGPU_GPU_FLAGS_SUPPORT_CYCLE_STATS;
#endif
}
void gk20a_init_gr_ops(struct gpu_ops *gops)
{
	gops->gr.access_smpc_reg = gr_gk20a_access_smpc_reg;
	gops->gr.bundle_cb_defaults = gr_gk20a_bundle_cb_defaults;
	gops->gr.cb_size_default = gr_gk20a_cb_size_default;
	gops->gr.calc_global_ctx_buffer_size =
		gr_gk20a_calc_global_ctx_buffer_size;
	gops->gr.commit_global_attrib_cb = gr_gk20a_commit_global_attrib_cb;
	gops->gr.commit_global_bundle_cb = gr_gk20a_commit_global_bundle_cb;
	gops->gr.commit_global_cb_manager = gr_gk20a_commit_global_cb_manager;
	gops->gr.commit_global_pagepool = gr_gk20a_commit_global_pagepool;
	gops->gr.handle_sw_method = gr_gk20a_handle_sw_method;
	gops->gr.set_alpha_circular_buffer_size =
		gk20a_gr_set_alpha_circular_buffer_size;
	gops->gr.set_circular_buffer_size =
		gk20a_gr_set_circular_buffer_size;
	gops->gr.enable_hww_exceptions = gr_gk20a_enable_hww_exceptions;
	gops->gr.is_valid_class = gr_gk20a_is_valid_class;
	gops->gr.get_sm_dsm_perf_regs = gr_gk20a_get_sm_dsm_perf_regs;
	gops->gr.get_sm_dsm_perf_ctrl_regs = gr_gk20a_get_sm_dsm_perf_ctrl_regs;
	gops->gr.init_fs_state = gr_gk20a_ctx_state_floorsweep;
	gops->gr.set_hww_esr_report_mask = gr_gk20a_set_hww_esr_report_mask;
	gops->gr.setup_alpha_beta_tables = gr_gk20a_setup_alpha_beta_tables;
	gops->gr.falcon_load_ucode = gr_gk20a_load_ctxsw_ucode_segments;
	gops->gr.load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode;
	gops->gr.get_gpc_tpc_mask = gr_gk20a_get_gpc_tpc_mask;
	gops->gr.free_channel_ctx = gk20a_free_channel_ctx;
	gops->gr.alloc_obj_ctx = gk20a_alloc_obj_ctx;
	gops->gr.free_obj_ctx = gk20a_free_obj_ctx;
	gops->gr.bind_ctxsw_zcull = gr_gk20a_bind_ctxsw_zcull;
	gops->gr.get_zcull_info = gr_gk20a_get_zcull_info;
	gops->gr.is_tpc_addr = gr_gk20a_is_tpc_addr;
	gops->gr.get_tpc_num = gr_gk20a_get_tpc_num;
	gops->gr.detect_sm_arch = gr_gk20a_detect_sm_arch;
	gops->gr.add_zbc_color = gr_gk20a_add_zbc_color;
	gops->gr.add_zbc_depth = gr_gk20a_add_zbc_depth;
	gops->gr.zbc_set_table = gk20a_gr_zbc_set_table;
	gops->gr.zbc_query_table = gr_gk20a_query_zbc;
	gops->gr.pagepool_default_size = gr_gk20a_pagepool_default_size;
	gops->gr.init_ctx_state = gr_gk20a_init_ctx_state;
	gops->gr.alloc_gr_ctx = gr_gk20a_alloc_gr_ctx;
	gops->gr.free_gr_ctx = gr_gk20a_free_gr_ctx;
	gops->gr.dump_gr_regs = gr_gk20a_dump_gr_status_regs;
	gops->gr.get_max_fbps_count = gr_gk20a_get_max_fbps_count;
	gops->gr.get_fbp_en_mask = gr_gk20a_get_fbp_en_mask;
	gops->gr.get_max_ltc_per_fbp = gr_gk20a_get_max_ltc_per_fbp;
	gops->gr.get_max_lts_per_ltc = gr_gk20a_get_max_lts_per_ltc;
	gops->gr.get_rop_l2_en_mask = gr_gk20a_rop_l2_en_mask;
	gops->gr.init_sm_dsm_reg_info = gr_gk20a_init_sm_dsm_reg_info;
	gops->gr.wait_empty = gr_gk20a_wait_idle;
	gops->gr.init_cyclestats = gr_gk20a_init_cyclestats;