1 /*
2  * GK20A Graphics
3  *
4  * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program; if not, write to the Free Software Foundation, Inc.,
17  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18  */
19
20 #include <linux/delay.h>        /* for udelay */
21 #include <linux/mm.h>           /* for totalram_pages */
22 #include <linux/scatterlist.h>
23 #include <linux/tegra-soc.h>
24 #include <linux/nvhost_dbg_gpu_ioctl.h>
25 #include <linux/vmalloc.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/firmware.h>
28 #include <linux/nvhost.h>
29
30 #include "gk20a.h"
31 #include "kind_gk20a.h"
32 #include "gr_ctx_gk20a.h"
33
34 #include "hw_ccsr_gk20a.h"
35 #include "hw_ctxsw_prog_gk20a.h"
36 #include "hw_fifo_gk20a.h"
37 #include "hw_gr_gk20a.h"
38 #include "hw_gmmu_gk20a.h"
39 #include "hw_mc_gk20a.h"
40 #include "hw_ram_gk20a.h"
41 #include "hw_pri_ringmaster_gk20a.h"
42 #include "hw_pri_ringstation_sys_gk20a.h"
43 #include "hw_pri_ringstation_gpc_gk20a.h"
44 #include "hw_pri_ringstation_fbp_gk20a.h"
45 #include "hw_proj_gk20a.h"
46 #include "hw_top_gk20a.h"
47 #include "hw_ltc_gk20a.h"
48 #include "hw_fb_gk20a.h"
49 #include "hw_therm_gk20a.h"
50 #include "hw_pbdma_gk20a.h"
51 #include "gr_pri_gk20a.h"
52 #include "regops_gk20a.h"
53 #include "dbg_gpu_gk20a.h"
54
55 #define BLK_SIZE (256)
56
57 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
58
59 /* global ctx buffer */
60 static int  gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
61 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
62 static int  gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
63                                             struct channel_gk20a *c);
64 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
65
66 /* channel gr ctx buffer */
67 static int  gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
68                                         struct channel_gk20a *c);
69 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
70
71 /* channel patch ctx buffer */
72 static int  gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
73                                         struct channel_gk20a *c);
74 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
75
76 /* golden ctx image */
77 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
78                                           struct channel_gk20a *c);
79 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
80                                           struct channel_gk20a *c);
81
82 void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
83 {
84         int i;
85
86         gk20a_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
87                 gk20a_readl(g, gr_fecs_os_r()));
88         gk20a_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
89                 gk20a_readl(g, gr_fecs_cpuctl_r()));
90         gk20a_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
91                 gk20a_readl(g, gr_fecs_idlestate_r()));
92         gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
93                 gk20a_readl(g, gr_fecs_mailbox0_r()));
94         gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
95                 gk20a_readl(g, gr_fecs_mailbox1_r()));
96         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
97                 gk20a_readl(g, gr_fecs_irqstat_r()));
98         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
99                 gk20a_readl(g, gr_fecs_irqmode_r()));
100         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
101                 gk20a_readl(g, gr_fecs_irqmask_r()));
102         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
103                 gk20a_readl(g, gr_fecs_irqdest_r()));
104         gk20a_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
105                 gk20a_readl(g, gr_fecs_debug1_r()));
106         gk20a_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
107                 gk20a_readl(g, gr_fecs_debuginfo_r()));
108
109         for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
110                 gk20a_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
111                         i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
112
113         gk20a_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
114                 gk20a_readl(g, gr_fecs_engctl_r()));
115         gk20a_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
116                 gk20a_readl(g, gr_fecs_curctx_r()));
117         gk20a_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
118                 gk20a_readl(g, gr_fecs_nxtctx_r()));
119
120         gk20a_writel(g, gr_fecs_icd_cmd_r(),
121                 gr_fecs_icd_cmd_opc_rreg_f() |
122                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
123         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
124                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
125
126         gk20a_writel(g, gr_fecs_icd_cmd_r(),
127                 gr_fecs_icd_cmd_opc_rreg_f() |
128                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
129         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
130                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
131
132         gk20a_writel(g, gr_fecs_icd_cmd_r(),
133                 gr_fecs_icd_cmd_opc_rreg_f() |
134                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
135         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
136                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
137
138         gk20a_writel(g, gr_fecs_icd_cmd_r(),
139                 gr_fecs_icd_cmd_opc_rreg_f() |
140                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
141         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
142                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
143
144         gk20a_writel(g, gr_fecs_icd_cmd_r(),
145                 gr_fecs_icd_cmd_opc_rreg_f() |
146                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
147         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
148                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
149
150         for (i = 0; i < 4; i++) {
151                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
152                         gr_fecs_icd_cmd_opc_rreg_f() |
153                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
154                 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
155                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
156
157                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
158                         gr_fecs_icd_cmd_opc_rreg_f() |
159                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
160                 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
161                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
162         }
163 }
164
165 static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
166 {
167         u32 i, ucode_u32_size;
168         const u32 *ucode_u32_data;
169         u32 checksum;
170
171         gk20a_dbg_fn("");
172
173         gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
174                                               gr_gpccs_dmemc_blk_f(0)  |
175                                               gr_gpccs_dmemc_aincw_f(1)));
176
177         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
178         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
179
180         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
181                 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
182                 checksum += ucode_u32_data[i];
183         }
184
185         gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
186                                              gr_fecs_dmemc_blk_f(0)  |
187                                              gr_fecs_dmemc_aincw_f(1)));
188
189         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
190         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
191
192         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
193                 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
194                 checksum += ucode_u32_data[i];
195         }
196         gk20a_dbg_fn("done");
197 }
198
199 static void gr_gk20a_load_falcon_imem(struct gk20a *g)
200 {
201         u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
202         const u32 *ucode_u32_data;
203         u32 tag, i, pad_start, pad_end;
204         u32 checksum;
205
206         gk20a_dbg_fn("");
207
208         cfg = gk20a_readl(g, gr_fecs_cfg_r());
209         fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
210
211         cfg = gk20a_readl(g, gr_gpc0_cfg_r());
212         gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
213
214         /* Use the broadcast address to access all of the GPCCS units. */
215         gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
216                                               gr_gpccs_imemc_blk_f(0) |
217                                               gr_gpccs_imemc_aincw_f(1)));
218
219         /* Setup the tags for the instruction memory. */
220         tag = 0;
221         gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
222
223         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
224         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
225
226         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
227                 if (i && ((i % (256/sizeof(u32))) == 0)) {
228                         tag++;
229                         gk20a_writel(g, gr_gpccs_imemt_r(0),
230                                       gr_gpccs_imemt_tag_f(tag));
231                 }
232                 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
233                 checksum += ucode_u32_data[i];
234         }
235
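            /* Pad with zeros from the end of the ucode out to the next
             * 256-byte IMEM block boundary plus one extra block, bumping the
             * tag at each block boundary (offsets here are in bytes). */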
236         pad_start = i*4;
237         pad_end = pad_start+(256-pad_start%256)+256;
238         for (i = pad_start;
239              (i < gpccs_imem_size * 256) && (i < pad_end);
240              i += 4) {
241                 if (i && ((i % 256) == 0)) {
242                         tag++;
243                         gk20a_writel(g, gr_gpccs_imemt_r(0),
244                                       gr_gpccs_imemt_tag_f(tag));
245                 }
246                 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
247         }
248
249         gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
250                                              gr_fecs_imemc_blk_f(0) |
251                                              gr_fecs_imemc_aincw_f(1)));
252
253         /* Setup the tags for the instruction memory. */
254         tag = 0;
255         gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
256
257         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
258         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
259
260         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
261                 if (i && ((i % (256/sizeof(u32))) == 0)) {
262                         tag++;
263                         gk20a_writel(g, gr_fecs_imemt_r(0),
264                                       gr_fecs_imemt_tag_f(tag));
265                 }
266                 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
267                 checksum += ucode_u32_data[i];
268         }
269
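            /* Pad the FECS IMEM out to a 256-byte block boundary plus one
             * extra block, in the same way as the GPCCS IMEM above. */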
270         pad_start = i*4;
271         pad_end = pad_start+(256-pad_start%256)+256;
272         for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
273                 if (i && ((i % 256) == 0)) {
274                         tag++;
275                         gk20a_writel(g, gr_fecs_imemt_r(0),
276                                       gr_fecs_imemt_tag_f(tag));
277                 }
278                 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
279         }
280 }
281
282 static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
283                 u32 expect_delay)
284 {
285         u32 delay = expect_delay;
286         bool gr_enabled;
287         bool ctxsw_active;
288         bool gr_busy;
289
290         gk20a_dbg_fn("");
291
292         do {
293                 /* fmodel: host gets fifo_engine_status(gr) from gr
294                    only when gr_status is read */
295                 gk20a_readl(g, gr_status_r());
296
297                 gr_enabled = gk20a_readl(g, mc_enable_r()) &
298                         mc_enable_pgraph_enabled_f();
299
300                 ctxsw_active = gk20a_readl(g,
301                         fifo_engine_status_r(ENGINE_GR_GK20A)) &
302                         fifo_engine_status_ctxsw_in_progress_f();
303
304                 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
305                         gr_engine_status_value_busy_f();
306
307                 if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
308                         gk20a_dbg_fn("done");
309                         return 0;
310                 }
311
312                 usleep_range(delay, delay * 2);
313                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
314
315         } while (time_before(jiffies, end_jiffies)
316                         || !tegra_platform_is_silicon());
317
318         gk20a_err(dev_from_gk20a(g),
319                 "timeout, ctxsw busy : %d, gr busy : %d",
320                 ctxsw_active, gr_busy);
321
322         return -EAGAIN;
323 }
324
325 static int gr_gk20a_wait_fe_idle(struct gk20a *g, unsigned long end_jiffies,
326                 u32 expect_delay)
327 {
328         u32 val;
329         u32 delay = expect_delay;
330
331         gk20a_dbg_fn("");
332
333         do {
334                 val = gk20a_readl(g, gr_status_r());
335
336                 if (!gr_status_fe_method_upper_v(val) &&
337                         !gr_status_fe_method_lower_v(val) &&
338                         !gr_status_fe_method_fe_gi_v(val)) {
339                         gk20a_dbg_fn("done");
340                         return 0;
341                 }
342
343                 usleep_range(delay, delay * 2);
344                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
345         } while (time_before(jiffies, end_jiffies)
346                         || !tegra_platform_is_silicon());
347
348         gk20a_err(dev_from_gk20a(g),
349                 "timeout, fe busy : %x", val);
350
351         return -EAGAIN;
352 }
353 static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
354 {
355         u32 delay = GR_IDLE_CHECK_DEFAULT;
356         unsigned long end_jiffies = jiffies +
357                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
358         u32 reg;
359
360         gk20a_dbg_fn("");
361
362         if (!tegra_platform_is_linsim()) {
363                 /* Force clocks on */
364                 gk20a_writel(g, gr_fe_pwr_mode_r(),
365                              gr_fe_pwr_mode_req_send_f() |
366                              gr_fe_pwr_mode_mode_force_on_f());
367
368                 /* Wait for the clocks to indicate that they are on */
369                 do {
370                         reg = gk20a_readl(g, gr_fe_pwr_mode_r());
371
372                         if (gr_fe_pwr_mode_req_v(reg) ==
373                                         gr_fe_pwr_mode_req_done_v())
374                                 break;
375
376                         usleep_range(delay, delay * 2);
377                         delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
378
379                 } while (time_before(jiffies, end_jiffies));
380
381                 if (!time_before(jiffies, end_jiffies)) {
382                         gk20a_err(dev_from_gk20a(g),
383                                    "failed to force the clocks on\n");
384                         WARN_ON(1);
385                 }
386         }
387         if (rst_mask) {
388                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
389         } else {
390                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
391                              gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
392                              gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
393                              gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
394                              gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
395                              gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
396                              gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
397                              gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
398                              gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
399                              gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
400         }
401
402         /* we need to read the reset register *and* wait for a moment to ensure
403          * reset propagation */
404
405         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
406         udelay(20);
407
408         gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
409                      gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
410                      gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
411                      gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
412                      gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
413                      gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
414                      gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
415                      gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
416                      gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
417                      gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
418
419         /* read the reset register back and wait again so the reset can propagate */
420         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
421         udelay(20);
422
423         if (!tegra_platform_is_linsim()) {
424                 /* Set power mode back to auto */
425                 gk20a_writel(g, gr_fe_pwr_mode_r(),
426                              gr_fe_pwr_mode_req_send_f() |
427                              gr_fe_pwr_mode_mode_auto_f());
428
429                 /* Wait for the request to complete */
430                 end_jiffies = jiffies +
431                         msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
432                 do {
433                         reg = gk20a_readl(g, gr_fe_pwr_mode_r());
434
435                         if (gr_fe_pwr_mode_req_v(reg) ==
436                                         gr_fe_pwr_mode_req_done_v())
437                                 break;
438
439                         usleep_range(delay, delay * 2);
440                         delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
441
442                 } while (time_before(jiffies, end_jiffies));
443
444                 if (!time_before(jiffies, end_jiffies))
445                         gk20a_warn(dev_from_gk20a(g),
446                                    "failed to set power mode to auto\n");
447         }
448
449         return 0;
450 }
451
452 static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
453                                    u32 *mailbox_ret, u32 opc_success,
454                                    u32 mailbox_ok, u32 opc_fail,
455                                    u32 mailbox_fail)
456 {
457         unsigned long end_jiffies = jiffies +
458                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
459         u32 delay = GR_IDLE_CHECK_DEFAULT;
460         u32 check = WAIT_UCODE_LOOP;
461         u32 reg;
462
463         gk20a_dbg_fn("");
464
465         while (check == WAIT_UCODE_LOOP) {
466                 if (!time_before(jiffies, end_jiffies) &&
467                                 tegra_platform_is_silicon())
468                         check = WAIT_UCODE_TIMEOUT;
469
470                 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
471
472                 if (mailbox_ret)
473                         *mailbox_ret = reg;
474
475                 switch (opc_success) {
476                 case GR_IS_UCODE_OP_EQUAL:
477                         if (reg == mailbox_ok)
478                                 check = WAIT_UCODE_OK;
479                         break;
480                 case GR_IS_UCODE_OP_NOT_EQUAL:
481                         if (reg != mailbox_ok)
482                                 check = WAIT_UCODE_OK;
483                         break;
484                 case GR_IS_UCODE_OP_AND:
485                         if (reg & mailbox_ok)
486                                 check = WAIT_UCODE_OK;
487                         break;
488                 case GR_IS_UCODE_OP_LESSER:
489                         if (reg < mailbox_ok)
490                                 check = WAIT_UCODE_OK;
491                         break;
492                 case GR_IS_UCODE_OP_LESSER_EQUAL:
493                         if (reg <= mailbox_ok)
494                                 check = WAIT_UCODE_OK;
495                         break;
496                 case GR_IS_UCODE_OP_SKIP:
497                         /* no success check */
498                         break;
499                 default:
500                         gk20a_err(dev_from_gk20a(g),
501                                    "invalid success opcode 0x%x", opc_success);
502
503                         check = WAIT_UCODE_ERROR;
504                         break;
505                 }
506
507                 switch (opc_fail) {
508                 case GR_IS_UCODE_OP_EQUAL:
509                         if (reg == mailbox_fail)
510                                 check = WAIT_UCODE_ERROR;
511                         break;
512                 case GR_IS_UCODE_OP_NOT_EQUAL:
513                         if (reg != mailbox_fail)
514                                 check = WAIT_UCODE_ERROR;
515                         break;
516                 case GR_IS_UCODE_OP_AND:
517                         if (reg & mailbox_fail)
518                                 check = WAIT_UCODE_ERROR;
519                         break;
520                 case GR_IS_UCODE_OP_LESSER:
521                         if (reg < mailbox_fail)
522                                 check = WAIT_UCODE_ERROR;
523                         break;
524                 case GR_IS_UCODE_OP_LESSER_EQUAL:
525                         if (reg <= mailbox_fail)
526                                 check = WAIT_UCODE_ERROR;
527                         break;
528                 case GR_IS_UCODE_OP_SKIP:
529                         /* no check on failure */
530                         break;
531                 default:
532                         gk20a_err(dev_from_gk20a(g),
533                                    "invalid fail opcode 0x%x", opc_fail);
534                         check = WAIT_UCODE_ERROR;
535                         break;
536                 }
537
538                 usleep_range(delay, delay * 2);
539                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
540         }
541
542         if (check == WAIT_UCODE_TIMEOUT) {
543                 gk20a_err(dev_from_gk20a(g),
544                            "timeout waiting on ucode response");
545                 gk20a_fecs_dump_falcon_stats(g);
546                 return -1;
547         } else if (check == WAIT_UCODE_ERROR) {
548                 gk20a_err(dev_from_gk20a(g),
549                            "ucode method failed on mailbox=%d value=0x%08x",
550                            mailbox_id, reg);
551                 gk20a_fecs_dump_falcon_stats(g);
552                 return -1;
553         }
554
555         gk20a_dbg_fn("done");
556         return 0;
557 }
558
559 /* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...).
560  * Most, if not all, fecs method calls should be converted to use it instead. */
561 struct fecs_method_op_gk20a {
562         struct {
563                 u32 addr;
564                 u32 data;
565         } method;
566
567         struct {
568                 u32 id;
569                 u32 data;
570                 u32 clr;
571                 u32 *ret;
572                 u32 ok;
573                 u32 fail;
574         } mailbox;
575
576         struct {
577                 u32 ok;
578                 u32 fail;
579         } cond;
580
581 };
582
583 int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
584                                    struct fecs_method_op_gk20a op)
585 {
586         struct gr_gk20a *gr = &g->gr;
587         int ret;
588
589         mutex_lock(&gr->fecs_mutex);
590
591         if (op.mailbox.id != 0)
592                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
593                              op.mailbox.data);
594
595         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
596                 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
597
598         gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
599         gk20a_writel(g, gr_fecs_method_push_r(),
600                 gr_fecs_method_push_adr_f(op.method.addr));
601
602         /* op.mailbox.id == 4 cases require waiting for completion
603          * on mailbox 0 instead */
604         if (op.mailbox.id == 4)
605                 op.mailbox.id = 0;
606
607         ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
608                                       op.cond.ok, op.mailbox.ok,
609                                       op.cond.fail, op.mailbox.fail);
610
611         mutex_unlock(&gr->fecs_mutex);
612
613         return ret;
614 }
615
616 int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
617 {
618         return gr_gk20a_submit_fecs_method_op(g,
619               (struct fecs_method_op_gk20a) {
620                       .method.addr = fecs_method,
621                       .method.data = ~0,
622                       .mailbox = { .id   = 1, /*sideband?*/
623                                    .data = ~0, .clr = ~0, .ret = ret,
624                                    .ok   = gr_fecs_ctxsw_mailbox_value_pass_v(),
625                                    .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
626                       .cond.ok = GR_IS_UCODE_OP_EQUAL,
627                       .cond.fail = GR_IS_UCODE_OP_EQUAL });
628 }
629
630 /* Stop processing (stall) context switches at FECS.
631  * The caller must hold the dbg_sessions_lock, else if multiple stop methods
632  * are sent to the ucode in sequence, it can get into an undefined state. */
633 int gr_gk20a_disable_ctxsw(struct gk20a *g)
634 {
635         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
636         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
637 }
638
639 /* Start processing (continue) context switches at FECS */
640 int gr_gk20a_enable_ctxsw(struct gk20a *g)
641 {
642         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
643         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
644 }
645
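    /*
     * Usage sketch (illustrative only, not called anywhere in this driver):
     * a debugger-style caller brackets its work with the stop/start pair
     * above while holding the dbg_sessions_lock.  The lock living directly
     * on struct gk20a is an assumption made for this example.
     *
     *	mutex_lock(&g->dbg_sessions_lock);
     *	err = gr_gk20a_disable_ctxsw(g);
     *	if (!err) {
     *		... inspect or modify gr state here ...
     *		err = gr_gk20a_enable_ctxsw(g);
     *	}
     *	mutex_unlock(&g->dbg_sessions_lock);
     */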
646
647 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
648 {
649         u32 addr_lo;
650         u32 addr_hi;
651         void *inst_ptr = NULL;
652
653         gk20a_dbg_fn("");
654
655         inst_ptr = c->inst_block.cpuva;
656         if (!inst_ptr)
657                 return -ENOMEM;
658
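            /* the gr context gpu va is programmed as a 4 KB aligned pointer:
             * the low 12 bits are dropped from the low word */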
659         addr_lo = u64_lo32(gpu_va) >> 12;
660         addr_hi = u64_hi32(gpu_va);
661
662         gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
663                  ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
664                  ram_in_gr_wfi_ptr_lo_f(addr_lo));
665
666         gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
667                  ram_in_gr_wfi_ptr_hi_f(addr_hi));
668
669         return 0;
670 }
671
672 /*
673  * Context state can be written directly, or "patched" at times.  So that
674  * the same code can be used in either situation, it is written as a series
675  * of _ctx_patch_write(..., patch) statements.  However, any necessary cpu
676  * map/unmap and gpu l2 invalidates should be minimized (to avoid doing
677  * them for every patch write).  Set up such a sequence with
678  * "_ctx_patch_write_begin" and close it with "_ctx_patch_write_end"
679  * (see the usage sketch after gr_gk20a_ctx_patch_write() below).
680  */
681 int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
682                                           struct channel_ctx_gk20a *ch_ctx)
683 {
684         /* being defensive still... */
685         if (ch_ctx->patch_ctx.cpu_va) {
686                 gk20a_err(dev_from_gk20a(g), "nested ctx patch begin?");
687                 return -EBUSY;
688         }
689
690         ch_ctx->patch_ctx.cpu_va = vmap(ch_ctx->patch_ctx.pages,
691                         PAGE_ALIGN(ch_ctx->patch_ctx.size) >> PAGE_SHIFT,
692                         0, pgprot_dmacoherent(PAGE_KERNEL));
693
694         if (!ch_ctx->patch_ctx.cpu_va)
695                 return -ENOMEM;
696
697         return 0;
698 }
699
700 int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
701                                         struct channel_ctx_gk20a *ch_ctx)
702 {
703         /* being defensive still... */
704         if (!ch_ctx->patch_ctx.cpu_va) {
705                 gk20a_err(dev_from_gk20a(g), "dangling ctx patch end?");
706                 return -EINVAL;
707         }
708
709         vunmap(ch_ctx->patch_ctx.cpu_va);
710         ch_ctx->patch_ctx.cpu_va = NULL;
711         return 0;
712 }
713
714 int gr_gk20a_ctx_patch_write(struct gk20a *g,
715                                     struct channel_ctx_gk20a *ch_ctx,
716                                     u32 addr, u32 data, bool patch)
717 {
718         u32 patch_slot = 0;
719         void *patch_ptr = NULL;
720         bool mapped_here = false;
721
722         BUG_ON(patch != 0 && ch_ctx == NULL);
723
724         if (patch) {
725                 if (!ch_ctx)
726                         return -EINVAL;
727                 /* an optimization prolog/epilog (write_begin/end) exists
728                  * to get rid of unnecessary maps and l2 invalidates,
729                  * but be defensive here still... */
730                 if (!ch_ctx->patch_ctx.cpu_va) {
731                         int err;
732                         gk20a_err(dev_from_gk20a(g),
733                                    "per-write ctx patch begin?");
734                         /* yes, gr_gk20a_ctx_patch_smpc causes this one */
735                         err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
736                         if (err)
737                                 return err;
738                         mapped_here = true;
739                 } else
740                         mapped_here = false;
741
742                 patch_ptr = ch_ctx->patch_ctx.cpu_va;
743                 patch_slot = ch_ctx->patch_ctx.data_count * 2;
744
745                 gk20a_mem_wr32(patch_ptr, patch_slot++, addr);
746                 gk20a_mem_wr32(patch_ptr, patch_slot++, data);
747
748                 ch_ctx->patch_ctx.data_count++;
749
750                 if (mapped_here)
751                         gr_gk20a_ctx_patch_write_end(g, ch_ctx);
752
753         } else
754                 gk20a_writel(g, addr, data);
755
756         return 0;
757 }
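
    /*
     * Usage sketch (illustrative only): callers that patch several values
     * batch them between one begin/end pair so the vmap/vunmap (and any L2
     * maintenance) happens once rather than per write; the addr/data pairs
     * below are placeholders.
     *
     *	err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
     *	if (err)
     *		return err;
     *	gr_gk20a_ctx_patch_write(g, ch_ctx, addr0, data0, true);
     *	gr_gk20a_ctx_patch_write(g, ch_ctx, addr1, data1, true);
     *	gr_gk20a_ctx_patch_write_end(g, ch_ctx);
     */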
758
759 static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
760                                         struct channel_gk20a *c)
761 {
762         u32 inst_base_ptr = u64_lo32(c->inst_block.cpu_pa
763                                      >> ram_in_base_shift_v());
764         u32 ret;
765
766         gk20a_dbg_info("bind channel %d inst ptr 0x%08x",
767                    c->hw_chid, inst_base_ptr);
768
769         ret = gr_gk20a_submit_fecs_method_op(g,
770                      (struct fecs_method_op_gk20a) {
771                      .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
772                      .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
773                                      gr_fecs_current_ctx_target_vid_mem_f() |
774                                      gr_fecs_current_ctx_valid_f(1)),
775                      .mailbox = { .id = 0, .data = 0,
776                                   .clr = 0x30,
777                                   .ret = NULL,
778                                   .ok = 0x10,
779                                   .fail = 0x20, },
780                      .cond.ok = GR_IS_UCODE_OP_AND,
781                      .cond.fail = GR_IS_UCODE_OP_AND});
782         if (ret)
783                 gk20a_err(dev_from_gk20a(g),
784                         "bind channel instance failed");
785
786         return ret;
787 }
788
789 static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
790                                     bool disable_fifo)
791 {
792         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
793         struct fifo_gk20a *f = &g->fifo;
794         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
795         u32 va_lo, va_hi, va;
796         int ret = 0;
797         void *ctx_ptr = NULL;
798
799         gk20a_dbg_fn("");
800
801         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
802                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
803                         0, pgprot_dmacoherent(PAGE_KERNEL));
804         if (!ctx_ptr)
805                 return -ENOMEM;
806
807         if (ch_ctx->zcull_ctx.gpu_va == 0 &&
808             ch_ctx->zcull_ctx.ctx_sw_mode ==
809                 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
810                 ret = -EINVAL;
811                 goto clean_up;
812         }
813
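            /* pack bits 39:8 of the zcull buffer va into one 32-bit value */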
814         va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
815         va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
816         va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
817
818         if (disable_fifo) {
819                 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
820                 if (ret) {
821                         gk20a_err(dev_from_gk20a(g),
822                                 "failed to disable gr engine activity\n");
823                         goto clean_up;
824                 }
825         }
826
827         gk20a_mm_fb_flush(g);
828
829         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
830                  ch_ctx->zcull_ctx.ctx_sw_mode);
831
832         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);
833
834         if (disable_fifo) {
835                 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
836                 if (ret) {
837                         gk20a_err(dev_from_gk20a(g),
838                                 "failed to enable gr engine activity\n");
839                         goto clean_up;
840                 }
841         }
842
843 clean_up:
844         vunmap(ctx_ptr);
845
846         return ret;
847 }
848
849 static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
850                         struct channel_gk20a *c, bool patch)
851 {
852         struct gr_gk20a *gr = &g->gr;
853         struct channel_ctx_gk20a *ch_ctx = NULL;
854         u32 attrib_offset_in_chunk = 0;
855         u32 alpha_offset_in_chunk = 0;
856         u32 pd_ab_max_output;
857         u32 gpc_index, ppc_index;
858         u32 temp;
859         u32 cbm_cfg_size1, cbm_cfg_size2;
860
861         gk20a_dbg_fn("");
862
863         if (patch) {
864                 int err;
865                 ch_ctx = &c->ch_ctx;
866                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
867                 if (err)
868                         return err;
869         }
870
871         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
872                 gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
873                 gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
874                 patch);
875
876         pd_ab_max_output = (gr->alpha_cb_default_size *
877                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
878                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
879
880         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
881                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
882                 gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
883
884         alpha_offset_in_chunk = attrib_offset_in_chunk +
885                 gr->tpc_count * gr->attrib_cb_size;
886
887         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
888                 temp = proj_gpc_stride_v() * gpc_index;
889                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
890                      ppc_index++) {
891                         cbm_cfg_size1 = gr->attrib_cb_default_size *
892                                 gr->pes_tpc_count[ppc_index][gpc_index];
893                         cbm_cfg_size2 = gr->alpha_cb_default_size *
894                                 gr->pes_tpc_count[ppc_index][gpc_index];
895
896                         gr_gk20a_ctx_patch_write(g, ch_ctx,
897                                 gr_gpc0_ppc0_cbm_cfg_r() + temp +
898                                 proj_ppc_in_gpc_stride_v() * ppc_index,
899                                 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
900                                 gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
901                                 gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
902
903                         attrib_offset_in_chunk += gr->attrib_cb_size *
904                                 gr->pes_tpc_count[ppc_index][gpc_index];
905
906                         gr_gk20a_ctx_patch_write(g, ch_ctx,
907                                 gr_gpc0_ppc0_cbm_cfg2_r() + temp +
908                                 proj_ppc_in_gpc_stride_v() * ppc_index,
909                                 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
910                                 gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
911
912                         alpha_offset_in_chunk += gr->alpha_cb_size *
913                                 gr->pes_tpc_count[ppc_index][gpc_index];
914                 }
915         }
916
917         if (patch)
918                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
919
920         return 0;
921 }
922
923 static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
924                         struct channel_gk20a *c, bool patch)
925 {
926         struct gr_gk20a *gr = &g->gr;
927         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
928         u64 addr;
929         u32 size;
930
931         gk20a_dbg_fn("");
932         if (patch) {
933                 int err;
934                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
935                 if (err)
936                         return err;
937         }
938
939         /* global pagepool buffer */
940         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
941                 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
942                 (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
943                  (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
944
945         size = gr->global_ctx_buffer[PAGEPOOL].size /
946                 gr_scc_pagepool_total_pages_byte_granularity_v();
947
948         if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
949                 size = gr_scc_pagepool_total_pages_hwmax_v();
950
951         gk20a_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
952                 addr, size);
953
954         g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, patch);
955
956         /* global bundle cb */
957         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
958                 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
959                 (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
960                  (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
961
962         size = gr->bundle_cb_default_size;
963
964         gk20a_dbg_info("bundle cb addr : 0x%016llx, size : %d",
965                 addr, size);
966
967         g->ops.gr.commit_global_bundle_cb(g, ch_ctx, addr, size, patch);
968
969         /* global attrib cb */
970         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
971                 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
972                 (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
973                  (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
974
975         gk20a_dbg_info("attrib cb addr : 0x%016llx", addr);
976         g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, patch);
977
978         if (patch)
979                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
980
981         return 0;
982 }
983
984 static void gr_gk20a_commit_global_attrib_cb(struct gk20a *g,
985                                             struct channel_ctx_gk20a *ch_ctx,
986                                             u64 addr, bool patch)
987 {
988         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
989                 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
990                 gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
991
992         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
993                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
994                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
995 }
996
997 static void gr_gk20a_commit_global_bundle_cb(struct gk20a *g,
998                                             struct channel_ctx_gk20a *ch_ctx,
999                                             u64 addr, u64 size, bool patch)
1000 {
1001         u32 data;
1002
1003         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
1004                 gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
1005
1006         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
1007                 gr_scc_bundle_cb_size_div_256b_f(size) |
1008                 gr_scc_bundle_cb_size_valid_true_f(), patch);
1009
1010         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
1011                 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
1012
1013         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
1014                 gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
1015                 gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
1016
1017         /* data for state_limit */
1018         data = (g->gr.bundle_cb_default_size *
1019                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
1020                 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
1021
1022         data = min_t(u32, data, g->gr.min_gpm_fifo_depth);
1023
1024         gk20a_dbg_info("bundle cb token limit : %d, state limit : %d",
1025                    g->gr.bundle_cb_token_limit, data);
1026
1027         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
1028                 gr_pd_ab_dist_cfg2_token_limit_f(g->gr.bundle_cb_token_limit) |
1029                 gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
1030
1031 }
1032
1033 static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
1034 {
1035         struct gr_gk20a *gr = &g->gr;
1036         struct channel_ctx_gk20a *ch_ctx = NULL;
1037         u32 gpm_pd_cfg;
1038         u32 pd_ab_dist_cfg0;
1039         u32 ds_debug;
1040         u32 mpc_vtg_debug;
1041         u32 pe_vaf;
1042         u32 pe_vsc_vpc;
1043
1044         gk20a_dbg_fn("");
1045
1046         gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1047         pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1048         ds_debug = gk20a_readl(g, gr_ds_debug_r());
1049         mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1050
1051         if (patch) {
1052                 int err;
1053                 ch_ctx = &c->ch_ctx;
1054                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
1055                 if (err)
1056                         return err;
1057         }
1058
1059         if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1060                 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1061                 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1062
1063                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1064                 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1065                 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1066                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1067                 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1068                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1069
1070                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1071                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
1072                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
1073                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1074                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1075                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1076         } else {
1077                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1078                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1079                 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1080                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1081
1082                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1083                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1084                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1085                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1086         }
1087
1088         if (patch)
1089                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1090
1091         return 0;
1092 }
1093
1094 int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
1095 {
1096         u32 norm_entries, norm_shift;
1097         u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1098         u32 map0, map1, map2, map3, map4, map5;
1099
1100         if (!gr->map_tiles)
1101                 return -1;
1102
1103         gk20a_dbg_fn("");
1104
1105         gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1106                      gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1107                      gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1108
1109         map0 =  gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
1110                 gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
1111                 gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
1112                 gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
1113                 gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
1114                 gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
1115
1116         map1 =  gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
1117                 gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
1118                 gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
1119                 gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
1120                 gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
1121                 gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
1122
1123         map2 =  gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
1124                 gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
1125                 gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
1126                 gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
1127                 gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
1128                 gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
1129
1130         map3 =  gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
1131                 gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
1132                 gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
1133                 gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
1134                 gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
1135                 gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
1136
1137         map4 =  gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
1138                 gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
1139                 gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
1140                 gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
1141                 gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
1142                 gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
1143
1144         map5 =  gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
1145                 gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
1146                 gr_crstr_gpc_map5_tile32_f(0) |
1147                 gr_crstr_gpc_map5_tile33_f(0) |
1148                 gr_crstr_gpc_map5_tile34_f(0) |
1149                 gr_crstr_gpc_map5_tile35_f(0);
1150
1151         gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1152         gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1153         gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1154         gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1155         gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1156         gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1157
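             /* pick norm_shift so that tpc_count << norm_shift roughly
              * normalizes into the [16, 32) range used by the table below */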
1158         switch (gr->tpc_count) {
1159         case 1:
1160                 norm_shift = 4;
1161                 break;
1162         case 2:
1163         case 3:
1164                 norm_shift = 3;
1165                 break;
1166         case 4:
1167         case 5:
1168         case 6:
1169         case 7:
1170                 norm_shift = 2;
1171                 break;
1172         case 8:
1173         case 9:
1174         case 10:
1175         case 11:
1176         case 12:
1177         case 13:
1178         case 14:
1179         case 15:
1180                 norm_shift = 1;
1181                 break;
1182         default:
1183                 norm_shift = 0;
1184                 break;
1185         }
1186
1187         norm_entries = gr->tpc_count << norm_shift;
1188         coeff5_mod = (1 << 5) % norm_entries;
1189         coeff6_mod = (1 << 6) % norm_entries;
1190         coeff7_mod = (1 << 7) % norm_entries;
1191         coeff8_mod = (1 << 8) % norm_entries;
1192         coeff9_mod = (1 << 9) % norm_entries;
1193         coeff10_mod = (1 << 10) % norm_entries;
1194         coeff11_mod = (1 << 11) % norm_entries;
1195
1196         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1197                      gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1198                      gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1199                      gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1200                      gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1201                      gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1202
1203         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1204                      gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1205                      gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1206                      gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1207                      gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1208                      gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1209                      gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1210
1211         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1212         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1213         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1214         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1215         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1216         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1217
1218         gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1219                      gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1220                      gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1221
1222         gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1223         gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1224         gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1225         gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1226         gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1227         gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1228
1229         return 0;
1230 }
1231
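     /* Return the number of set bits in mask (Kernighan's bit-clearing loop). */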
1232 static inline u32 count_bits(u32 mask)
1233 {
1234         u32 temp = mask;
1235         u32 count;
1236         for (count = 0; temp != 0; count++)
1237                 temp &= temp - 1;
1238
1239         return count;
1240 }
1241
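     /* Clear up to clear_count of the lowest set bits in num and return the result. */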
1242 static inline u32 clear_count_bits(u32 num, u32 clear_count)
1243 {
1244         u32 count = clear_count;
1245         for (; (num != 0) && (count != 0); count--)
1246                 num &= num - 1;
1247
1248         return num;
1249 }
1250
1251 static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
1252                                         struct gr_gk20a *gr)
1253 {
1254         u32 table_index_bits = 5;
1255         u32 rows = (1 << table_index_bits);
1256         u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
1257
1258         u32 row;
1259         u32 index;
1260         u32 gpc_index;
1261         u32 gpcs_per_reg = 4;
1262         u32 pes_index;
1263         u32 tpc_count_pes;
1264         u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
1265
1266         u32 alpha_target, beta_target;
1267         u32 alpha_bits, beta_bits;
1268         u32 alpha_mask, beta_mask, partial_mask;
1269         u32 reg_offset;
1270         bool assign_alpha;
1271
1272         u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
1273         u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
1274         u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
1275
1276         gk20a_dbg_fn("");
1277
1278         memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1279         memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1280         memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1281
1282         for (row = 0; row < rows; ++row) {
1283                 alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
1284                 beta_target = gr->tpc_count - alpha_target;
1285
1286                 assign_alpha = (alpha_target < beta_target);
1287
1288                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1289                         reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
1290                         alpha_mask = beta_mask = 0;
1291
1292                         for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
1293                                 tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
1294
1295                                 if (assign_alpha) {
1296                                         alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
1297                                         beta_bits = tpc_count_pes - alpha_bits;
1298                                 } else {
1299                                         beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
1300                                         alpha_bits = tpc_count_pes - beta_bits;
1301                                 }
1302
1303                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
1304                                 partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
1305                                 alpha_mask |= partial_mask;
1306
1307                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
1308                                 beta_mask |= partial_mask;
1309
1310                                 alpha_target -= min(alpha_bits, alpha_target);
1311                                 beta_target -= min(beta_bits, beta_target);
1312
1313                                 if ((alpha_bits > 0) || (beta_bits > 0))
1314                                         assign_alpha = !assign_alpha;
1315                         }
1316
1317                         switch (gpc_index % gpcs_per_reg) {
1318                         case 0:
1319                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
1320                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
1321                                 break;
1322                         case 1:
1323                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
1324                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
1325                                 break;
1326                         case 2:
1327                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
1328                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
1329                                 break;
1330                         case 3:
1331                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
1332                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
1333                                 break;
1334                         }
1335                         map_reg_used[reg_offset] = true;
1336                 }
1337         }
1338
1339         for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
1340                 if (map_reg_used[index]) {
1341                         gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
1342                         gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
1343                 }
1344         }
1345
1346         return 0;
1347 }
1348
1349 static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
1350 {
1351         struct gr_gk20a *gr = &g->gr;
1352         u32 tpc_index, gpc_index;
1353         u32 tpc_offset, gpc_offset;
1354         u32 sm_id = 0, gpc_id = 0;
1355         u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
1356         u32 tpc_per_gpc;
1357         u32 max_ways_evict = INVALID_MAX_WAYS;
1358         u32 l1c_dbg_reg_val;
1359
1360         gk20a_dbg_fn("");
1361
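             /*
              * Hand out consecutive SM ids, iterating over all GPCs for each
              * TPC index, and program every id into the TPC's SM_CFG, L1C and
              * PE SMID registers; also record the number of active TPCs per
              * GPC.
              */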
1362         for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
1363                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1364                         gpc_offset = proj_gpc_stride_v() * gpc_index;
1365                         if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
1366                                 tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
1367
1368                                 gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
1369                                              gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
1370                                 gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
1371                                              gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
1372                                 gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
1373                                              gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
1374                                 gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
1375                                              gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
1376
1377                                 sm_id_to_gpc_id[sm_id] = gpc_index;
1378                                 sm_id++;
1379                         }
1380
1381                         gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
1382                                      gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1383                         gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
1384                                      gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1385                 }
1386         }
1387
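             /*
              * Each gr_pd_num_tpc_per_gpc / gr_ds_num_tpc_per_gpc register
              * packs the TPC counts of eight consecutive GPCs (count0..7).
              */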
1388         for (tpc_index = 0, gpc_id = 0;
1389              tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
1390              tpc_index++, gpc_id += 8) {
1391
1392                 if (gpc_id >= gr->gpc_count)
1393                         gpc_id = 0;
1394
1395                 tpc_per_gpc =
1396                         gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
1397                         gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
1398                         gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
1399                         gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
1400                         gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
1401                         gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
1402                         gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
1403                         gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
1404
1405                 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1406                 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1407         }
1408
1409         /* gr__setup_pd_mapping stubbed for gk20a */
1410         gr_gk20a_setup_rop_mapping(g, gr);
1411         if (g->ops.gr.setup_alpha_beta_tables)
1412                 g->ops.gr.setup_alpha_beta_tables(g, gr);
1413
1414         if (gr->num_fbps == 1)
1415                 max_ways_evict = 9;
1416
1417         if (max_ways_evict != INVALID_MAX_WAYS)
1418                 g->ops.ltc.set_max_ways_evict_last(g, max_ways_evict);
1419
1420         for (gpc_index = 0;
1421              gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1422              gpc_index += 4) {
1423
1424                 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1425                              gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
1426                              gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
1427                              gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
1428                              gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
1429         }
1430
1431         gk20a_writel(g, gr_cwd_fs_r(),
1432                      gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1433                      gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1434
1435         gk20a_writel(g, gr_bes_zrop_settings_r(),
1436                      gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1437         gk20a_writel(g, gr_bes_crop_settings_r(),
1438                      gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1439
1440         /* turn on cya15 bit for a default val that missed the cut */
1441         l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
1442         l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
1443         gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);
1444
1445         return 0;
1446 }
1447
1448 static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1449 {
1450         struct gk20a *g = c->g;
1451         int ret;
1452
1453         u32 inst_base_ptr =
1454                 u64_lo32(c->inst_block.cpu_pa
1455                 >> ram_in_base_shift_v());
1456
1457
1458         gk20a_dbg_fn("");
1459
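             /*
              * Hand FECS the channel's instance block pointer and issue the
              * requested save method; completion or failure is reported back
              * through ctxsw mailbox 0.
              */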
1460         ret = gr_gk20a_submit_fecs_method_op(g,
1461                 (struct fecs_method_op_gk20a) {
1462                 .method.addr = save_type,
1463                 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1464                                 gr_fecs_current_ctx_target_vid_mem_f() |
1465                                 gr_fecs_current_ctx_valid_f(1)),
1466                 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1467                         .ok = 1, .fail = 2,
1468                 },
1469                 .cond.ok = GR_IS_UCODE_OP_AND,
1470                 .cond.fail = GR_IS_UCODE_OP_AND,
1471                  });
1472
1473         if (ret)
1474                 gk20a_err(dev_from_gk20a(g), "save context image failed");
1475
1476         return ret;
1477 }
1478
1479 static u32 gk20a_init_sw_bundle(struct gk20a *g)
1480 {
1481         struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
1482         u32 last_bundle_data = 0;
1483         u32 err = 0;
1484         int i;
1485         unsigned long end_jiffies = jiffies +
1486                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
1487
1488         /* enable pipe mode override */
1489         gk20a_writel(g, gr_pipe_bundle_config_r(),
1490                 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
1491
1492         /* load bundle init */
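             /*
              * The FE must go idle before each bundle is issued; the data
              * register is rewritten only when the value changes, and the
              * GO_IDLE bundle additionally requires a full GR idle wait.
              */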
1493         for (i = 0; i < sw_bundle_init->count; i++) {
1494                 err |= gr_gk20a_wait_fe_idle(g, end_jiffies,
1495                                         GR_IDLE_CHECK_DEFAULT);
1496                 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
1497                         gk20a_writel(g, gr_pipe_bundle_data_r(),
1498                                 sw_bundle_init->l[i].value);
1499                         last_bundle_data = sw_bundle_init->l[i].value;
1500                 }
1501
1502                 gk20a_writel(g, gr_pipe_bundle_address_r(),
1503                              sw_bundle_init->l[i].addr);
1504
1505                 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
1506                     GR_GO_IDLE_BUNDLE)
1507                         err |= gr_gk20a_wait_idle(g, end_jiffies,
1508                                         GR_IDLE_CHECK_DEFAULT);
1509         }
1510
1511         /* disable pipe mode override */
1512         gk20a_writel(g, gr_pipe_bundle_config_r(),
1513                      gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1514
1515         return err;
1516 }
1517
1518 /* Initialize the global golden image from a fresh gr_ctx in the channel ctx
1519    and save a copy in ctx_vars.local_golden_image. */
1520 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1521                                           struct channel_gk20a *c)
1522 {
1523         struct gr_gk20a *gr = &g->gr;
1524         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1525         u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1526         u32 ctx_header_words;
1527         u32 i;
1528         u32 data;
1529         void *ctx_ptr = NULL;
1530         void *gold_ptr = NULL;
1531         u32 err = 0;
1532
1533         gk20a_dbg_fn("");
1534
1535         /* The golden ctx is global to all channels. Although only the first
1536            channel initializes the golden image, the driver must prevent
1537            multiple channels from initializing it at the same time. */
1538         mutex_lock(&gr->ctx_mutex);
1539
1540         if (gr->ctx_vars.golden_image_initialized)
1541                 goto clean_up;
1542
1543         err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1544         if (err)
1545                 goto clean_up;
1546
1547         err = gk20a_init_sw_bundle(g);
1548         if (err)
1549                 goto clean_up;
1550
1551         err = gr_gk20a_elpg_protected_call(g,
1552                         gr_gk20a_commit_global_ctx_buffers(g, c, false));
1553         if (err)
1554                 goto clean_up;
1555
1556         gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].pages,
1557                         PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].size) >>
1558                         PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
1559         if (!gold_ptr) {
                     err = -ENOMEM;
1560                 goto clean_up;
             }
1561
1562         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1563                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1564                         0, pgprot_dmacoherent(PAGE_KERNEL));
1565         if (!ctx_ptr) {
                     err = -ENOMEM;
1566                 goto clean_up;
             }
1567
1568         ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1569         ctx_header_words >>= 2;
1570
1571         gk20a_mm_l2_flush(g, true);
1572
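             /*
              * Copy the FECS ctxsw header from the freshly initialized channel
              * context into the golden image, one 32-bit word at a time.
              */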
1573         for (i = 0; i < ctx_header_words; i++) {
1574                 data = gk20a_mem_rd32(ctx_ptr, i);
1575                 gk20a_mem_wr32(gold_ptr, i, data);
1576         }
1577
1578         gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
1579                  ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1580
1581         gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);
1582
1583         gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1584
1585         gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1586
1587         if (gr->ctx_vars.local_golden_image == NULL) {
1588
1589                 gr->ctx_vars.local_golden_image =
1590                         kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
1591
1592                 if (gr->ctx_vars.local_golden_image == NULL) {
1593                         err = -ENOMEM;
1594                         goto clean_up;
1595                 }
1596
1597                 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1598                         gr->ctx_vars.local_golden_image[i] =
1599                                 gk20a_mem_rd32(gold_ptr, i);
1600         }
1601
1602         gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
1603
1604         gr->ctx_vars.golden_image_initialized = true;
1605
1606         gk20a_writel(g, gr_fecs_current_ctx_r(),
1607                 gr_fecs_current_ctx_valid_false_f());
1608
1609 clean_up:
1610         if (err)
1611                 gk20a_err(dev_from_gk20a(g), "fail");
1612         else
1613                 gk20a_dbg_fn("done");
1614
1615         if (gold_ptr)
1616                 vunmap(gold_ptr);
1617         if (ctx_ptr)
1618                 vunmap(ctx_ptr);
1619
1620         mutex_unlock(&gr->ctx_mutex);
1621         return err;
1622 }
1623
1624 int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1625                                     struct channel_gk20a *c,
1626                                     bool enable_smpc_ctxsw)
1627 {
1628         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1629         void *ctx_ptr = NULL;
1630         u32 data;
1631
1632         /* Channel gr_ctx buffer is gpu cacheable.
1633            Flush and invalidate before cpu update. */
1634         gk20a_mm_l2_flush(g, true);
1635
1636         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1637                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1638                         0, pgprot_dmacoherent(PAGE_KERNEL));
1639         if (!ctx_ptr)
1640                 return -ENOMEM;
1641
1642         data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1643         data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1644         data |= enable_smpc_ctxsw ?
1645                 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1646                 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1647         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1648                  data);
1649
1650         vunmap(ctx_ptr);
1651
1652         return 0;
1653 }
1654
1655 /* load a saved copy of the golden image into the channel gr_ctx */
1656 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1657                                         struct channel_gk20a *c)
1658 {
1659         struct gr_gk20a *gr = &g->gr;
1660         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1661         u32 virt_addr_lo;
1662         u32 virt_addr_hi;
1663         u32 i, v, data;
1664         int ret = 0;
1665         void *ctx_ptr = NULL;
1666
1667         gk20a_dbg_fn("");
1668
1669         if (gr->ctx_vars.local_golden_image == NULL)
1670                 return -1;
1671
1672         /* Channel gr_ctx buffer is gpu cacheable.
1673            Flush and invalidate before cpu update. */
1674         gk20a_mm_l2_flush(g, true);
1675
1676         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1677                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1678                         0, pgprot_dmacoherent(PAGE_KERNEL));
1679         if (!ctx_ptr)
1680                 return -ENOMEM;
1681
1682         for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1683                 gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
1684
1685         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
1686         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);
1687
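             /*
              * Point the context image at the channel's patch buffer and
              * record how many patch entries it currently holds.
              */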
1688         virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
1689         virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
1690
1691         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
1692                  ch_ctx->patch_ctx.data_count);
1693         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
1694                  virt_addr_lo);
1695         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
1696                  virt_addr_hi);
1697
1698         /* no user for the client-managed performance counter ctx */
1699         ch_ctx->pm_ctx.ctx_sw_mode =
1700                 ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1701         data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1702         data = data & ~ctxsw_prog_main_image_pm_mode_m();
1703         data |= ch_ctx->pm_ctx.ctx_sw_mode;
1704         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1705                  data);
1706
1707         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);
1708
1709         /* set priv access map */
1710         virt_addr_lo =
1711                  u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1712         virt_addr_hi =
1713                  u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1714
1715         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0,
1716                  ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f());
1717         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0,
1718                  virt_addr_lo);
1719         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0,
1720                  virt_addr_hi);
1721         /* disable verif features */
1722         v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0);
1723         v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
1724         v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
1725         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v);
1726
1727
1728         vunmap(ctx_ptr);
1729
1730         if (tegra_platform_is_linsim()) {
1731                 u32 inst_base_ptr =
1732                         u64_lo32(c->inst_block.cpu_pa
1733                         >> ram_in_base_shift_v());
1734
1735                 ret = gr_gk20a_submit_fecs_method_op(g,
1736                           (struct fecs_method_op_gk20a) {
1737                                   .method.data =
1738                                           (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1739                                            gr_fecs_current_ctx_target_vid_mem_f() |
1740                                            gr_fecs_current_ctx_valid_f(1)),
1741                                   .method.addr =
1742                                           gr_fecs_method_push_adr_restore_golden_v(),
1743                                   .mailbox = {
1744                                           .id = 0, .data = 0,
1745                                           .clr = ~0, .ret = NULL,
1746                                           .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
1747                                           .fail = 0},
1748                                   .cond.ok = GR_IS_UCODE_OP_EQUAL,
1749                                   .cond.fail = GR_IS_UCODE_OP_SKIP});
1750
1751                 if (ret)
1752                         gk20a_err(dev_from_gk20a(g),
1753                                    "restore context image failed");
1754         }
1755
1756         return ret;
1757 }
1758
1759 static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
1760 {
1761         gk20a_dbg_fn("");
1762
1763         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
1764                      gr_fecs_ctxsw_mailbox_clear_value_f(~0));
1765
1766         gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
1767         gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
1768
1769         gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
1770         gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
1771
1772         gk20a_dbg_fn("done");
1773 }
1774
1775 static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
1776 {
1777         struct mm_gk20a *mm = &g->mm;
1778         struct vm_gk20a *vm = &mm->pmu.vm;
1779         struct device *d = dev_from_gk20a(g);
1780         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1781         void *inst_ptr;
1782         u32 pde_addr_lo;
1783         u32 pde_addr_hi;
1784         u64 pde_addr;
1785         dma_addr_t iova;
1786
1787         /* Alloc mem of inst block */
1788         ucode_info->inst_blk_desc.size = ram_in_alloc_size_v();
1789         ucode_info->inst_blk_desc.cpuva = dma_alloc_coherent(d,
1790                                         ucode_info->inst_blk_desc.size,
1791                                         &iova,
1792                                         GFP_KERNEL);
1793         if (!ucode_info->inst_blk_desc.cpuva) {
1794                 gk20a_err(d, "failed to allocate memory\n");
1795                 return -ENOMEM;
1796         }
1797
1798         ucode_info->inst_blk_desc.iova = iova;
1799         ucode_info->inst_blk_desc.cpu_pa = gk20a_get_phys_from_iova(d,
1800                                         ucode_info->inst_blk_desc.iova);
1801
1802         inst_ptr = ucode_info->inst_blk_desc.cpuva;
1803
1804         /* Set inst block */
1805         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
1806                  u64_lo32(vm->va_limit) | 0xFFF);
1807         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
1808                 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
1809
1810         pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
1811         pde_addr_lo = u64_lo32(pde_addr >> 12);
1812         pde_addr_hi = u64_hi32(pde_addr);
1813         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
1814                 ram_in_page_dir_base_target_vid_mem_f() |
1815                 ram_in_page_dir_base_vol_true_f() |
1816                 ram_in_page_dir_base_lo_f(pde_addr_lo));
1817         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
1818                 ram_in_page_dir_base_hi_f(pde_addr_hi));
1819
1820         /* Map ucode surface to GMMU */
1821         ucode_info->ucode_gpuva = gk20a_gmmu_map(vm,
1822                                         &ucode_info->surface_desc.sgt,
1823                                         ucode_info->surface_desc.size,
1824                                         0, /* flags */
1825                                         gk20a_mem_flag_read_only);
1826         if (!ucode_info->ucode_gpuva) {
1827                 gk20a_err(d, "failed to update gmmu ptes\n");
1828                 return -ENOMEM;
1829         }
1830
1831         return 0;
1832 }
1833
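     /*
      * Each falcon's ucode is packed into the shared surface as boot, code and
      * data segments laid out back to back, each aligned to the 256-byte
      * BLK_SIZE.
      */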
1834 static void gr_gk20a_init_ctxsw_ucode_segment(
1835         struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
1836 {
1837         p_seg->offset = *offset;
1838         p_seg->size = size;
1839         *offset = ALIGN(*offset + size, BLK_SIZE);
1840 }
1841
1842 static void gr_gk20a_init_ctxsw_ucode_segments(
1843         struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
1844         struct gk20a_ctxsw_bootloader_desc *bootdesc,
1845         u32 code_size, u32 data_size)
1846 {
1847         u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
1848         segments->boot_entry = bootdesc->entry_point;
1849         segments->boot_imem_offset = bootdesc->imem_offset;
1850         gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
1851         gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
1852         gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
1853 }
1854
1855 static int gr_gk20a_copy_ctxsw_ucode_segments(
1856         u8 *buf,
1857         struct gk20a_ctxsw_ucode_segments *segments,
1858         u32 *bootimage,
1859         u32 *code, u32 *data)
1860 {
1861         memcpy(buf + segments->boot.offset, bootimage, segments->boot.size);
1862         memcpy(buf + segments->code.offset, code,      segments->code.size);
1863         memcpy(buf + segments->data.offset, data,      segments->data.size);
1864         return 0;
1865 }
1866
1867 static int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
1868 {
1869         struct device *d = dev_from_gk20a(g);
1870         struct mm_gk20a *mm = &g->mm;
1871         struct vm_gk20a *vm = &mm->pmu.vm;
1872         struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
1873         struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
1874         const struct firmware *fecs_fw;
1875         const struct firmware *gpccs_fw;
1876         u32 *fecs_boot_image;
1877         u32 *gpccs_boot_image;
1878         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1879         u8 *buf;
1880         u32 ucode_size;
1881         int err = 0;
1882         dma_addr_t iova;
1883         DEFINE_DMA_ATTRS(attrs);
1884
1885         fecs_fw = gk20a_request_firmware(g, GK20A_FECS_UCODE_IMAGE);
1886         if (!fecs_fw) {
1887                 gk20a_err(d, "failed to load fecs ucode!!");
1888                 return -ENOENT;
1889         }
1890
1891         fecs_boot_desc = (void *)fecs_fw->data;
1892         fecs_boot_image = (void *)(fecs_fw->data +
1893                                 sizeof(struct gk20a_ctxsw_bootloader_desc));
1894
1895         gpccs_fw = gk20a_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE);
1896         if (!gpccs_fw) {
1897                 release_firmware(fecs_fw);
1898                 gk20a_err(d, "failed to load gpccs ucode!!");
1899                 return -ENOENT;
1900         }
1901
1902         gpccs_boot_desc = (void *)gpccs_fw->data;
1903         gpccs_boot_image = (void *)(gpccs_fw->data +
1904                                 sizeof(struct gk20a_ctxsw_bootloader_desc));
1905
1906         ucode_size = 0;
1907         gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
1908                 fecs_boot_desc,
1909                 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
1910                 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
1911         gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
1912                 gpccs_boot_desc,
1913                 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
1914                 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
1915
1916         ucode_info->surface_desc.size = ucode_size;
1917         dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
1918         ucode_info->surface_desc.cpuva = dma_alloc_attrs(d,
1919                                         ucode_info->surface_desc.size,
1920                                         &iova,
1921                                         GFP_KERNEL,
1922                                         &attrs);
1923         if (!ucode_info->surface_desc.cpuva) {
1924                 gk20a_err(d, "memory allocation failed\n");
1925                 err = -ENOMEM;
1926                 goto clean_up;
1927         }
1928
1929         ucode_info->surface_desc.iova = iova;
1930         err = gk20a_get_sgtable(d, &ucode_info->surface_desc.sgt,
1931                                 ucode_info->surface_desc.cpuva,
1932                                 ucode_info->surface_desc.iova,
1933                                 ucode_info->surface_desc.size);
1934         if (err) {
1935                 gk20a_err(d, "failed to create sg table\n");
1936                 goto clean_up;
1937         }
1938
1939         buf = (u8 *)ucode_info->surface_desc.cpuva;
1940         if (!buf) {
1941                 gk20a_err(d, "failed to map surface desc buffer");
1942                 err = -ENOMEM;
1943                 goto clean_up;
1944         }
1945
1946         gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs,
1947                 fecs_boot_image,
1948                 g->gr.ctx_vars.ucode.fecs.inst.l,
1949                 g->gr.ctx_vars.ucode.fecs.data.l);
1950
1951         release_firmware(fecs_fw);
1952         fecs_fw = NULL;
1953
1954         gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs,
1955                 gpccs_boot_image,
1956                 g->gr.ctx_vars.ucode.gpccs.inst.l,
1957                 g->gr.ctx_vars.ucode.gpccs.data.l);
1958
1959         release_firmware(gpccs_fw);
1960         gpccs_fw = NULL;
1961
1962         err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
1963         if (err)
1964                 goto clean_up;
1965
1966         gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
1967
1968         return 0;
1969
1970  clean_up:
1971         if (ucode_info->ucode_gpuva)
1972                 gk20a_gmmu_unmap(vm, ucode_info->ucode_gpuva,
1973                         ucode_info->surface_desc.size, gk20a_mem_flag_none);
1974         if (ucode_info->surface_desc.sgt)
1975                 gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
1976         if (ucode_info->surface_desc.cpuva)
1977                 dma_free_attrs(d, ucode_info->surface_desc.size,
1978                                 ucode_info->surface_desc.cpuva,
1979                                 ucode_info->surface_desc.iova,
1980                                 &attrs);
1981         ucode_info->surface_desc.cpuva = NULL;
1982         ucode_info->surface_desc.iova = 0;
1983
1984         release_firmware(gpccs_fw);
1985         gpccs_fw = NULL;
1986         release_firmware(fecs_fw);
1987         fecs_fw = NULL;
1988
1989         return err;
1990 }
1991
1992 static void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
1993 {
1994         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1995         int retries = 20;
1996         phys_addr_t inst_ptr;
1997         u32 val;
1998
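
             /*
              * Wait (at most 20 * 2 us) for the FECS context arbiter to go
              * idle before binding the new instance block.
              */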
1999         while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2000                         gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
2001                 udelay(2);
2002                 retries--;
2003         }
2004         if (!retries)
2005                 gk20a_err(dev_from_gk20a(g), "arbiter idle timeout");
2006
2007         gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
2008
2009         inst_ptr = ucode_info->inst_blk_desc.cpu_pa;
2010         gk20a_writel(g, gr_fecs_new_ctx_r(),
2011                         gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
2012                         gr_fecs_new_ctx_target_m() |
2013                         gr_fecs_new_ctx_valid_m());
2014
2015         gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
2016                         gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
2017                         gr_fecs_arb_ctx_ptr_target_m());
2018
2019         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
2020
2021         /* Wait for arbiter command to complete */
2022         retries = 20;
2023         val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2024         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2025                 udelay(2);
2026                 retries--;
2027                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2028         }
2029         if (!retries)
2030                 gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
2031
2032         gk20a_writel(g, gr_fecs_current_ctx_r(),
2033                         gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
2034                         gr_fecs_current_ctx_target_m() |
2035                         gr_fecs_current_ctx_valid_m());
2036         /* Send command to arbiter to flush */
2037         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
2038
2039         retries = 20;
2040         val = (gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
2041         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2042                 udelay(2);
2043                 retries--;
2044                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2045         }
2046         if (!retries)
2047                 gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
2048 }
2049
2050 static int gr_gk20a_load_ctxsw_ucode_segments(struct gk20a *g, u64 addr_base,
2051         struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2052 {
2053         u32 addr_code32;
2054         u32 addr_data32;
2055         u32 addr_load32;
2056         u32 dst = 0;
2057         u32 blocks;
2058         u32 b;
2059
2060         addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
2061         addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
2062         addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
2063
2064         gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
2065                         gr_fecs_dmactl_require_ctx_f(0));
2066
2067         /*
2068          * Copy falcon bootloader header into dmem at offset 0.
2069          * Configure dmem port 0 for auto-incrementing writes starting at dmem
2070          * offset 0.
2071          */
2072         gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2073                         gr_fecs_dmemc_offs_f(0) |
2074                         gr_fecs_dmemc_blk_f(0) |
2075                         gr_fecs_dmemc_aincw_f(1));
2076
2077         /* Write out the actual data */
2078         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2079         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2080         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2081         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->code.size);
2082         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2083         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32);
2084         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->data.size);
2085         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2086         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2087         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2088
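             /*
              * Round the bootloader size up to whole 256-byte blocks; each
              * block is DMAed into IMEM separately below.
              */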
2089         blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;
2090
2091         /*
2092          * Set the base FB address for the DMA transfer. Subtract off the 256
2093          * byte IMEM block offset such that the relative FB and IMEM offsets
2094          * match, allowing the IMEM tags to be properly created.
2095          */
2096
2097         dst = segments->boot_imem_offset;
2098         gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2099                         (addr_load32 - (dst >> 8)));
2100
2101         for (b = 0; b < blocks; b++) {
2102                 /* Setup destination IMEM offset */
2103                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2104                                 dst + (b << 8));
2105
2106                 /* Setup source offset (relative to BASE) */
2107                 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2108                                 dst + (b << 8));
2109
2110                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2111                                 gr_fecs_dmatrfcmd_imem_f(0x01) |
2112                                 gr_fecs_dmatrfcmd_write_f(0x00) |
2113                                 gr_fecs_dmatrfcmd_size_f(0x06) |
2114                                 gr_fecs_dmatrfcmd_ctxdma_f(0));
2115         }
2116
2117         /* Specify the falcon boot vector */
2118         gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2119                         gr_fecs_bootvec_vec_f(segments->boot_entry));
2120
2121         /* Write to CPUCTL to start the falcon */
2122         gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
2123                         gr_fecs_cpuctl_startcpu_f(0x01));
2124
2125         return 0;
2126 }
2127
2128 static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2129 {
2130         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2131         u64 addr_base = ucode_info->ucode_gpuva;
2132
2133         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2134
2135         gr_gk20a_load_falcon_bind_instblk(g);
2136
2137         gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
2138                 &g->ctxsw_ucode_info.fecs, 0);
2139
2140         gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
2141                 &g->ctxsw_ucode_info.gpccs,
2142                 gr_gpcs_gpccs_falcon_hwcfg_r() -
2143                 gr_fecs_falcon_hwcfg_r());
2144 }
2145
2146 static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
2147 {
2148         u32 ret;
2149
2150         gk20a_dbg_fn("");
2151
2152         if (tegra_platform_is_linsim()) {
2153                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2154                         gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2155                 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2156                         gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2157         }
2158
2159         /*
2160          * In case the gPMU falcon is not being used, revert to the old way of
2161          * loading gr ucode, without the faster bootstrap routine.
2162          */
2163         if (!support_gk20a_pmu()) {
2164                 gr_gk20a_load_falcon_dmem(g);
2165                 gr_gk20a_load_falcon_imem(g);
2166                 gr_gk20a_start_falcon_ucode(g);
2167         } else {
2168                 if (!gr->skip_ucode_init)
2169                         gr_gk20a_init_ctxsw_ucode(g);
2170                 gr_gk20a_load_falcon_with_bootloader(g);
2171                 gr->skip_ucode_init = true;
2172         }
2173
2174         ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
2175                                       GR_IS_UCODE_OP_EQUAL,
2176                                       eUcodeHandshakeInitComplete,
2177                                       GR_IS_UCODE_OP_SKIP, 0);
2178         if (ret) {
2179                 gk20a_err(dev_from_gk20a(g), "falcon ucode init timeout");
2180                 return ret;
2181         }
2182
2183         if (support_gk20a_pmu())
2184                 gk20a_writel(g, gr_fecs_current_ctx_r(),
2185                         gr_fecs_current_ctx_valid_false_f());
2186
2187         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2188         gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2189         gk20a_writel(g, gr_fecs_method_push_r(),
2190                      gr_fecs_method_push_adr_set_watchdog_timeout_f());
2191
2192         gk20a_dbg_fn("done");
2193         return 0;
2194 }
2195
2196 static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
2197 {
2198         u32 golden_ctx_image_size = 0;
2199         u32 zcull_ctx_image_size = 0;
2200         u32 pm_ctx_image_size = 0;
2201         u32 ret;
2202         struct fecs_method_op_gk20a op = {
2203                 .mailbox = { .id = 0, .data = 0,
2204                              .clr = ~0, .ok = 0, .fail = 0},
2205                 .method.data = 0,
2206                 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2207                 .cond.fail = GR_IS_UCODE_OP_SKIP,
2208                 };
2209
2210         gk20a_dbg_fn("");
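             /*
              * Ask FECS for the required golden, zcull and PM context image
              * sizes; each discover method returns its result through ctxsw
              * mailbox 0.
              */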
2211         op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
2212         op.mailbox.ret = &golden_ctx_image_size;
2213         ret = gr_gk20a_submit_fecs_method_op(g, op);
2214         if (ret) {
2215                 gk20a_err(dev_from_gk20a(g),
2216                            "query golden image size failed");
2217                 return ret;
2218         }
2219         op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
2220         op.mailbox.ret = &zcull_ctx_image_size;
2221         ret = gr_gk20a_submit_fecs_method_op(g, op);
2222         if (ret) {
2223                 gk20a_err(dev_from_gk20a(g),
2224                            "query zcull ctx image size failed");
2225                 return ret;
2226         }
2227         op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
2228         op.mailbox.ret = &pm_ctx_image_size;
2229         ret = gr_gk20a_submit_fecs_method_op(g, op);
2230         if (ret) {
2231                 gk20a_err(dev_from_gk20a(g),
2232                            "query pm ctx image size failed");
2233                 return ret;
2234         }
2235
2236         if (!g->gr.ctx_vars.golden_image_size &&
2237             !g->gr.ctx_vars.zcull_ctxsw_image_size) {
2238                 g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
2239                 g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
2240         } else {
2241                 /* hw is different after railgating? */
2242                 BUG_ON(g->gr.ctx_vars.golden_image_size != golden_ctx_image_size);
2243                 BUG_ON(g->gr.ctx_vars.zcull_ctxsw_image_size != zcull_ctx_image_size);
2244         }
2245
2246         g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2247
2248         gk20a_dbg_fn("done");
2249         return 0;
2250 }
2251
2252 static void gk20a_gr_destroy_ctx_buffer(struct platform_device *pdev,
2253                                         struct gr_ctx_buffer_desc *desc)
2254 {
2255         struct device *dev = &pdev->dev;
2256         gk20a_free_sgtable(&desc->sgt);
2257         dma_free_attrs(dev, desc->size, desc->pages,
2258                        desc->iova, &desc->attrs);
2259 }
2260
2261 static int gk20a_gr_alloc_ctx_buffer(struct platform_device *pdev,
2262                                      struct gr_ctx_buffer_desc *desc,
2263                                      size_t size)
2264 {
2265         struct device *dev = &pdev->dev;
2266         DEFINE_DMA_ATTRS(attrs);
2267         dma_addr_t iova;
2268         int err = 0;
2269
2270         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2271
2272         desc->pages = dma_alloc_attrs(&pdev->dev, size, &iova,
2273                                       GFP_KERNEL, &attrs);
2274         if (!desc->pages)
2275                 return -ENOMEM;
2276
2277         desc->iova = iova;
2278         desc->size = size;
2279         desc->attrs = attrs;
2280         desc->destroy = gk20a_gr_destroy_ctx_buffer;
2281         err = gk20a_get_sgtable_from_pages(&pdev->dev, &desc->sgt, desc->pages,
2282                                            desc->iova, desc->size);
2283         if (err) {
2284                 dma_free_attrs(dev, desc->size, desc->pages,
2285                                desc->iova, &desc->attrs);
2286                 memset(desc, 0, sizeof(*desc));
2287         }
2288
2289         return err;
2290 }
2291
2292 static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2293 {
2294         struct gk20a_platform *platform = platform_get_drvdata(g->dev);
2295         struct gr_gk20a *gr = &g->gr;
2296         int i, attr_buffer_size, err;
2297         struct platform_device *pdev = g->dev;
2298
2299         u32 cb_buffer_size = gr->bundle_cb_default_size *
2300                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2301
2302         u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
2303                 gr_scc_pagepool_total_pages_byte_granularity_v();
2304
2305         gk20a_dbg_fn("");
2306
2307         attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);
2308
2309         gk20a_dbg_info("cb_buffer_size : %d", cb_buffer_size);
2310
2311         err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[CIRCULAR],
2312                                         cb_buffer_size);
2313         if (err)
2314                 goto clean_up;
2315
2316         if (platform->secure_alloc)
2317                 platform->secure_alloc(pdev,
2318                                        &gr->global_ctx_buffer[CIRCULAR_VPR],
2319                                        cb_buffer_size);
2320
2321         gk20a_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
2322
2323         err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[PAGEPOOL],
2324                                         pagepool_buffer_size);
2325         if (err)
2326                 goto clean_up;
2327
2328         if (platform->secure_alloc)
2329                 platform->secure_alloc(pdev,
2330                                        &gr->global_ctx_buffer[PAGEPOOL_VPR],
2331                                        pagepool_buffer_size);
2332
2333         gk20a_dbg_info("attr_buffer_size : %d", attr_buffer_size);
2334
2335         err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[ATTRIBUTE],
2336                                         attr_buffer_size);
2337         if (err)
2338                 goto clean_up;
2339
2340         if (platform->secure_alloc)
2341                 platform->secure_alloc(pdev,
2342                                        &gr->global_ctx_buffer[ATTRIBUTE_VPR],
2343                                        attr_buffer_size);
2344
2345         if (platform->secure_buffer.destroy)
2346                 platform->secure_buffer.destroy(pdev, &platform->secure_buffer);
2347
2348         gk20a_dbg_info("golden_image_size : %d",
2349                    gr->ctx_vars.golden_image_size);
2350
2351         err = gk20a_gr_alloc_ctx_buffer(pdev,
2352                                         &gr->global_ctx_buffer[GOLDEN_CTX],
2353                                         gr->ctx_vars.golden_image_size);
2354         if (err)
2355                 goto clean_up;
2356
2357         gk20a_dbg_info("priv_access_map_size : %d",
2358                    gr->ctx_vars.priv_access_map_size);
2359
2360         err = gk20a_gr_alloc_ctx_buffer(pdev,
2361                                         &gr->global_ctx_buffer[PRIV_ACCESS_MAP],
2362                                         gr->ctx_vars.priv_access_map_size);
2363
2364         if (err)
2365                 goto clean_up;
2366
2367         gk20a_dbg_fn("done");
2368         return 0;
2369
2370  clean_up:
2371         gk20a_err(dev_from_gk20a(g), "fail");
2372         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2373                 if (gr->global_ctx_buffer[i].destroy) {
2374                         gr->global_ctx_buffer[i].destroy(pdev,
2375                                         &gr->global_ctx_buffer[i]);
2376                 }
2377         }
2378         return -ENOMEM;
2379 }
2380
2381 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2382 {
2383         struct platform_device *pdev = g->dev;
2384         struct gr_gk20a *gr = &g->gr;
2385         DEFINE_DMA_ATTRS(attrs);
2386         u32 i;
2387
2388         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2389
2390         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2391                 gr->global_ctx_buffer[i].destroy(pdev,
2392                                 &gr->global_ctx_buffer[i]);
2393         }
2394
2395         gk20a_dbg_fn("done");
2396 }
2397
2398 static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2399                                         struct channel_gk20a *c)
2400 {
2401         struct vm_gk20a *ch_vm = c->vm;
2402         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2403         u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2404         struct gr_gk20a *gr = &g->gr;
2405         struct sg_table *sgt;
2406         u64 size;
2407         u64 gpu_va;
2408         u32 i;
2409         gk20a_dbg_fn("");
2410
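             /*
              * For each global buffer, map the VPR variant into the channel's
              * address space when the channel is VPR and a secure copy was
              * allocated; otherwise map the normal variant.
              */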
2411         /* Circular Buffer */
2412         if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].sgt == NULL)) {
2413                 sgt = gr->global_ctx_buffer[CIRCULAR].sgt;
2414                 size = gr->global_ctx_buffer[CIRCULAR].size;
2415         } else {
2416                 sgt = gr->global_ctx_buffer[CIRCULAR_VPR].sgt;
2417                 size = gr->global_ctx_buffer[CIRCULAR_VPR].size;
2418         }
2419
2420         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2421                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2422                                 gk20a_mem_flag_none);
2423         if (!gpu_va)
2424                 goto clean_up;
2425         g_bfr_va[CIRCULAR_VA] = gpu_va;
2426         g_bfr_size[CIRCULAR_VA] = size;
2427
2428         /* Attribute Buffer */
2429         if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt == NULL)) {
2430                 sgt = gr->global_ctx_buffer[ATTRIBUTE].sgt;
2431                 size = gr->global_ctx_buffer[ATTRIBUTE].size;
2432         } else {
2433                 sgt = gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt;
2434                 size = gr->global_ctx_buffer[ATTRIBUTE_VPR].size;
2435         }
2436
2437         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2438                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2439                                 gk20a_mem_flag_none);
2440         if (!gpu_va)
2441                 goto clean_up;
2442         g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2443         g_bfr_size[ATTRIBUTE_VA] = size;
2444
2445         /* Page Pool */
2446         if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].sgt == NULL)) {
2447                 sgt = gr->global_ctx_buffer[PAGEPOOL].sgt;
2448                 size = gr->global_ctx_buffer[PAGEPOOL].size;
2449         } else {
2450                 sgt = gr->global_ctx_buffer[PAGEPOOL_VPR].sgt;
2451                 size = gr->global_ctx_buffer[PAGEPOOL_VPR].size;
2452         }
2453
2454         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2455                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2456                                 gk20a_mem_flag_none);
2457         if (!gpu_va)
2458                 goto clean_up;
2459         g_bfr_va[PAGEPOOL_VA] = gpu_va;
2460         g_bfr_size[PAGEPOOL_VA] = size;
2461
2462         /* Golden Image */
2463         sgt = gr->global_ctx_buffer[GOLDEN_CTX].sgt;
2464         size = gr->global_ctx_buffer[GOLDEN_CTX].size;
2465         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
2466                                 gk20a_mem_flag_none);
2467         if (!gpu_va)
2468                 goto clean_up;
2469         g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2470         g_bfr_size[GOLDEN_CTX_VA] = size;
2471
2472         /* Priv register Access Map */
2473         sgt = gr->global_ctx_buffer[PRIV_ACCESS_MAP].sgt;
2474         size = gr->global_ctx_buffer[PRIV_ACCESS_MAP].size;
2475         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
2476                                 gk20a_mem_flag_none);
2477         if (!gpu_va)
2478                 goto clean_up;
2479         g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2480         g_bfr_size[PRIV_ACCESS_MAP_VA] = size;
2481
2482         c->ch_ctx.global_ctx_buffer_mapped = true;
2483         return 0;
2484
2485  clean_up:
2486         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2487                 if (g_bfr_va[i]) {
2488                         gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
2489                                          gr->global_ctx_buffer[i].size,
2490                                          gk20a_mem_flag_none);
2491                         g_bfr_va[i] = 0;
2492                 }
2493         }
2494         return -ENOMEM;
2495 }
2496
2497 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
2498 {
2499         struct vm_gk20a *ch_vm = c->vm;
2500         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2501         u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2502         u32 i;
2503
2504         gk20a_dbg_fn("");
2505
2506         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2507                 if (g_bfr_va[i]) {
2508                         gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
2509                                          g_bfr_size[i],
2510                                          gk20a_mem_flag_none);
2511                         g_bfr_va[i] = 0;
2512                         g_bfr_size[i] = 0;
2513                 }
2514         }
2515         c->ch_ctx.global_ctx_buffer_mapped = false;
2516 }
2517
2518 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
2519                                 struct channel_gk20a *c)
2520 {
2521         struct gr_gk20a *gr = &g->gr;
2522         struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
2523         struct vm_gk20a *ch_vm = c->vm;
2524         struct device *d = dev_from_gk20a(g);
2525         struct sg_table *sgt;
2526         DEFINE_DMA_ATTRS(attrs);
2527         int err = 0;
2528         dma_addr_t iova;
2529
2530         gk20a_dbg_fn("");
2531
2532         if (gr->ctx_vars.buffer_size == 0)
2533                 return 0;
2534
2535         /* alloc channel gr ctx buffer */
2536         gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2537         gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2538
2539         gr_ctx->size = gr->ctx_vars.buffer_total_size;
2540         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2541         gr_ctx->pages = dma_alloc_attrs(d, gr_ctx->size,
2542                                 &iova, GFP_KERNEL, &attrs);
2543         if (!gr_ctx->pages)
2544                 return -ENOMEM;
2545
2546         gr_ctx->iova = iova;
2547         err = gk20a_get_sgtable_from_pages(d, &sgt, gr_ctx->pages,
2548                         gr_ctx->iova, gr_ctx->size);
2549         if (err)
2550                 goto err_free;
2551
2552         gr_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, gr_ctx->size,
2553                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2554                                 gk20a_mem_flag_none);
2555         if (!gr_ctx->gpu_va) {
                     err = -ENOMEM;
2556                 goto err_free_sgt;
             }
2557
2558         gk20a_free_sgtable(&sgt);
2559
2560         return 0;
2561
2562  err_free_sgt:
2563         gk20a_free_sgtable(&sgt);
2564  err_free:
2565         dma_free_attrs(d, gr_ctx->size,
2566                 gr_ctx->pages, gr_ctx->iova, &attrs);
2567         gr_ctx->pages = NULL;
2568         gr_ctx->iova = 0;
2569
2570         return err;
2571 }
2572
2573 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
2574 {
2575         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2576         struct vm_gk20a *ch_vm = c->vm;
2577         struct gk20a *g = c->g;
2578         struct device *d = dev_from_gk20a(g);
2579         DEFINE_DMA_ATTRS(attrs);
2580
2581         gk20a_dbg_fn("");
2582
2583         if (!ch_ctx->gr_ctx.gpu_va)
2584                 return;
2585
2586         gk20a_gmmu_unmap(ch_vm, ch_ctx->gr_ctx.gpu_va,
2587                         ch_ctx->gr_ctx.size, gk20a_mem_flag_none);
2588         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2589         dma_free_attrs(d, ch_ctx->gr_ctx.size,
2590                 ch_ctx->gr_ctx.pages, ch_ctx->gr_ctx.iova, &attrs);
2591         ch_ctx->gr_ctx.pages = NULL;
2592         ch_ctx->gr_ctx.iova = 0;
2593 }
2594
2595 static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
2596                                 struct channel_gk20a *c)
2597 {
2598         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2599         struct device *d = dev_from_gk20a(g);
2600         struct vm_gk20a *ch_vm = c->vm;
2601         DEFINE_DMA_ATTRS(attrs);
2602         struct sg_table *sgt;
2603         int err = 0;
2604         dma_addr_t iova;
2605
2606         gk20a_dbg_fn("");
2607
2608         patch_ctx->size = 128 * sizeof(u32);
2609         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2610         patch_ctx->pages = dma_alloc_attrs(d, patch_ctx->size,
2611                                 &iova, GFP_KERNEL,
2612                                 &attrs);
2613         if (!patch_ctx->pages)
2614                 return -ENOMEM;
2615
2616         patch_ctx->iova = iova;
2617         err = gk20a_get_sgtable_from_pages(d, &sgt, patch_ctx->pages,
2618                         patch_ctx->iova, patch_ctx->size);
2619         if (err)
2620                 goto err_free;
2621
2622         patch_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, patch_ctx->size,
2623                                         0, gk20a_mem_flag_none);
2624         if (!patch_ctx->gpu_va) {
                     err = -ENOMEM;
2625                 goto err_free_sgtable;
             }
2626
2627         gk20a_free_sgtable(&sgt);
2628
2629         gk20a_dbg_fn("done");
2630         return 0;
2631
2632  err_free_sgtable:
2633         gk20a_free_sgtable(&sgt);
2634  err_free:
2635         dma_free_attrs(d, patch_ctx->size,
2636                 patch_ctx->pages, patch_ctx->iova, &attrs);
2637         patch_ctx->pages = NULL;
2638         patch_ctx->iova = 0;
2639         gk20a_err(dev_from_gk20a(g), "fail");
2640         return err;
2641 }
2642
2643 static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
2644 {
2645         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2646         struct vm_gk20a *ch_vm = c->vm;
2647
2648         gk20a_dbg_fn("");
2649
2650         if (patch_ctx->gpu_va)
2651                 gk20a_gmmu_unmap(ch_vm, patch_ctx->gpu_va,
2652                         patch_ctx->size, gk20a_mem_flag_none);
2653         patch_ctx->gpu_va = 0;
2654         patch_ctx->data_count = 0;
2655 }
2656
2657 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
2658 {
2659         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2660         struct gk20a *g = c->g;
2661         struct device *d = dev_from_gk20a(g);
2662         DEFINE_DMA_ATTRS(attrs);
2663
2664         gk20a_dbg_fn("");
2665
2666         gr_gk20a_unmap_channel_patch_ctx(c);
2667
2668         if (patch_ctx->pages) {
2669                 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2670                 dma_free_attrs(d, patch_ctx->size,
2671                         patch_ctx->pages, patch_ctx->iova, &attrs);
2672                 patch_ctx->pages = NULL;
2673                 patch_ctx->iova = 0;
2674         }
2675 }
2676
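     /*
      * Release all per-channel graphics context state: unmap the global ctx
      * buffers, free the patch and gr ctx buffers, then clear ch_ctx.
      */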
2677 void gk20a_free_channel_ctx(struct channel_gk20a *c)
2678 {
2679         gr_gk20a_unmap_global_ctx_buffers(c);
2680         gr_gk20a_free_channel_patch_ctx(c);
2681         gr_gk20a_free_channel_gr_ctx(c);
2682
2683         /* zcull_ctx, pm_ctx */
2684
2685         memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
2686
2687         c->num_objects = 0;
2688         c->first_init = false;
2689 }
2690
2691 static bool gr_gk20a_is_valid_class(struct gk20a *g, u32 class_num)
2692 {
2693         bool valid = false;
2694
2695         switch (class_num) {
2696         case KEPLER_COMPUTE_A:
2697         case KEPLER_C:
2698         case FERMI_TWOD_A:
2699         case KEPLER_DMA_COPY_A:
2700                 valid = true;
2701                 break;
2702
2703         default:
2704                 break;
2705         }
2706
2707         return valid;
2708 }
2709
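     /*
      * Set up context state for a new object class on a channel: allocate the
      * gr ctx and patch ctx buffers, map and commit the global ctx buffers,
      * apply per-class tweaks, then initialize and load the golden context
      * image.
      */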
2710 int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
2711                         struct nvhost_alloc_obj_ctx_args *args)
2712 {
2713         struct gk20a *g = c->g;
2714         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2715         int err = 0;
2716
2717         gk20a_dbg_fn("");
2718
2719         /* an address space needs to have been bound at this point. */
2720         if (!gk20a_channel_as_bound(c)) {
2721                 gk20a_err(dev_from_gk20a(g),
2722                            "not bound to address space at time"
2723                            " of grctx allocation");
2724                 return -EINVAL;
2725         }
2726
2727         if (!g->ops.gr.is_valid_class(g, args->class_num)) {
2728                 gk20a_err(dev_from_gk20a(g),
2729                            "invalid obj class 0x%x", args->class_num);
2730                 err = -EINVAL;
2731                 goto out;
2732         }
2733
2734         /* allocate gr ctx buffer */
2735         if (ch_ctx->gr_ctx.pages == NULL) {
2736                 err = gr_gk20a_alloc_channel_gr_ctx(g, c);
2737                 if (err) {
2738                         gk20a_err(dev_from_gk20a(g),
2739                                 "fail to allocate gr ctx buffer");
2740                         goto out;
2741                 }
2742                 c->obj_class = args->class_num;
2743         } else {
2744                 /* TBD: needs to be more subtle about which class is being allocated,
2745                  * as some are allowed to be allocated along the same channel */
2746                 gk20a_err(dev_from_gk20a(g),
2747                         "too many classes alloc'd on same channel");
2748                 err = -EINVAL;
2749                 goto out;
2750         }
2751
2752         /* commit gr ctx buffer */
2753         err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
2754         if (err) {
2755                 gk20a_err(dev_from_gk20a(g),
2756                         "fail to commit gr ctx buffer");
2757                 goto out;
2758         }
2759
2760         /* allocate patch buffer */
2761         if (ch_ctx->patch_ctx.pages == NULL) {
2762                 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
2763                 if (err) {
2764                         gk20a_err(dev_from_gk20a(g),
2765                                 "fail to allocate patch buffer");
2766                         goto out;
2767                 }
2768         }
2769
2770         /* map global buffer to channel gpu_va and commit */
2771         if (!ch_ctx->global_ctx_buffer_mapped) {
2772                 err = gr_gk20a_map_global_ctx_buffers(g, c);
2773                 if (err) {
2774                         gk20a_err(dev_from_gk20a(g),
2775                                 "fail to map global ctx buffer");
2776                         goto out;
2777                 }
2778                 gr_gk20a_elpg_protected_call(g,
2779                         gr_gk20a_commit_global_ctx_buffers(g, c, true));
2780         }
2781
2782         /* tweak any perf parameters per-context here */
2783         if (args->class_num == KEPLER_COMPUTE_A) {
2784                 int begin_err;
2785                 u32 tex_lock_disable_mask =
2786                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_m()         |
2787                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_m()    |
2788                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_m()   |
2789                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_m()     |
2790                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_m() |
2791                         gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_m();
2792
2793                 u32 texlock = gk20a_readl(g, gr_gpcs_tpcs_sm_sch_texlock_r());
2794
2795                 texlock = (texlock & ~tex_lock_disable_mask) |
2796                 (gr_gpcs_tpcs_sm_sch_texlock_tex_hash_disable_f()         |
2797                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_disable_f()    |
2798                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_disable_f()   |
2799                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_disable_f()     |
2800                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_disable_f() |
2801                  gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_disable_f());
2802
2803                 begin_err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
2804
2805                 if (!begin_err) {
2806                         err = gr_gk20a_ctx_patch_write(g, ch_ctx,
2807                                 gr_gpcs_tpcs_sm_sch_texlock_r(),
2808                                 texlock, true);
2809                 }
2810                 if (begin_err || err) {
2811                         gk20a_err(dev_from_gk20a(g),
2812                                    "failed to set texlock for compute class");
2813                 }
2814                 if (!begin_err)
2815                         gr_gk20a_ctx_patch_write_end(g, ch_ctx);
2816         }
2817
2818         /* init golden image, ELPG enabled after this is done */
2819         err = gr_gk20a_init_golden_ctx_image(g, c);
2820         if (err) {
2821                 gk20a_err(dev_from_gk20a(g),
2822                         "fail to init golden ctx image");
2823                 goto out;
2824         }
2825
2826         /* load golden image */
2827         if (!c->first_init) {
2828                 err = gr_gk20a_elpg_protected_call(g,
2829                         gr_gk20a_load_golden_ctx_image(g, c));
2830                 if (err) {
2831                         gk20a_err(dev_from_gk20a(g),
2832                                 "fail to load golden ctx image");
2833                         goto out;
2834                 }
2835                 c->first_init = true;
2836         }
2837
2838         c->num_objects++;
2839
2840         gk20a_dbg_fn("done");
2841         return 0;
2842 out:
2843         /* 1. gr_ctx, patch_ctx and the global ctx buffer mappings
2844            can be reused, so there is no need to release them here.
2845            2. golden image init and load are one-time operations, so
2846            if they succeeded there is nothing to undo. */
2847         gk20a_err(dev_from_gk20a(g), "fail");
2848         return err;
2849 }
2850
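     /*
      * Drop one object reference; when the last object on the channel goes
      * away, disable the channel and unmap its patch context.
      */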
2851 int gk20a_free_obj_ctx(struct channel_gk20a  *c,
2852                        struct nvhost_free_obj_ctx_args *args)
2853 {
2854         unsigned long timeout = gk20a_get_gr_idle_timeout(c->g);
2855
2856         gk20a_dbg_fn("");
2857
2858         if (c->num_objects == 0)
2859                 return 0;
2860
2861         c->num_objects--;
2862
2863         if (c->num_objects == 0) {
2864                 c->first_init = false;
2865                 gk20a_disable_channel(c,
2866                         !c->has_timedout,
2867                         timeout);
2868                 gr_gk20a_unmap_channel_patch_ctx(c);
2869         }
2870
2871         return 0;
2872 }
2873
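     /*
      * Tear down software GR state: global ctx buffers, mmu_wr/rd buffers,
      * compbit store, per-GPC bookkeeping arrays, ctxsw ucode/register lists
      * and the local golden image copy.
      */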
2874 static void gk20a_remove_gr_support(struct gr_gk20a *gr)
2875 {
2876         struct gk20a *g = gr->g;
2877         struct device *d = dev_from_gk20a(g);
2878         DEFINE_DMA_ATTRS(attrs);
2879
2880         gk20a_dbg_fn("");
2881
2882         gr_gk20a_free_global_ctx_buffers(g);
2883
2884         dma_free_coherent(d, gr->mmu_wr_mem.size,
2885                 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
2886         gr->mmu_wr_mem.cpuva = NULL;
2887         gr->mmu_wr_mem.iova = 0;
2888         dma_free_coherent(d, gr->mmu_rd_mem.size,
2889                 gr->mmu_rd_mem.cpuva, gr->mmu_rd_mem.iova);
2890         gr->mmu_rd_mem.cpuva = NULL;
2891         gr->mmu_rd_mem.iova = 0;
2892
2893         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2894         dma_free_attrs(d, gr->compbit_store.size, gr->compbit_store.pages,
2895                         gr->compbit_store.base_iova, &attrs);
2896
2897         memset(&gr->mmu_wr_mem, 0, sizeof(struct mmu_desc));
2898         memset(&gr->mmu_rd_mem, 0, sizeof(struct mmu_desc));
2899         memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
2900
2901         kfree(gr->gpc_tpc_count);
2902         kfree(gr->gpc_zcb_count);
2903         kfree(gr->gpc_ppc_count);
2904         kfree(gr->pes_tpc_count[0]);
2905         kfree(gr->pes_tpc_count[1]);
2906         kfree(gr->pes_tpc_mask[0]);
2907         kfree(gr->pes_tpc_mask[1]);
2908         kfree(gr->gpc_skip_mask);
2909         kfree(gr->map_tiles);
2910         gr->gpc_tpc_count = NULL;
2911         gr->gpc_zcb_count = NULL;
2912         gr->gpc_ppc_count = NULL;
2913         gr->pes_tpc_count[0] = NULL;
2914         gr->pes_tpc_count[1] = NULL;
2915         gr->pes_tpc_mask[0] = NULL;
2916         gr->pes_tpc_mask[1] = NULL;
2917         gr->gpc_skip_mask = NULL;
2918         gr->map_tiles = NULL;
2919
2920         kfree(gr->ctx_vars.ucode.fecs.inst.l);
2921         kfree(gr->ctx_vars.ucode.fecs.data.l);
2922         kfree(gr->ctx_vars.ucode.gpccs.inst.l);
2923         kfree(gr->ctx_vars.ucode.gpccs.data.l);
2924         kfree(gr->ctx_vars.sw_bundle_init.l);
2925         kfree(gr->ctx_vars.sw_method_init.l);
2926         kfree(gr->ctx_vars.sw_ctx_load.l);
2927         kfree(gr->ctx_vars.sw_non_ctx_load.l);
2928         kfree(gr->ctx_vars.ctxsw_regs.sys.l);
2929         kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
2930         kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
2931         kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
2932         kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
2933         kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
2934         kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
2935         kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
2936
2937         kfree(gr->ctx_vars.local_golden_image);
2938         gr->ctx_vars.local_golden_image = NULL;
2939
2940         gk20a_allocator_destroy(&gr->comp_tags);
2941 }
2942
2943 static void gr_gk20a_bundle_cb_defaults(struct gk20a *g)
2944 {
2945         struct gr_gk20a *gr = &g->gr;
2946
2947         gr->bundle_cb_default_size =
2948                 gr_scc_bundle_cb_size_div_256b__prod_v();
2949         gr->min_gpm_fifo_depth =
2950                 gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
2951         gr->bundle_cb_token_limit =
2952                 gr_pd_ab_dist_cfg2_token_limit_init_v();
2953 }
2954
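     /*
      * Read the floorswept GPU configuration (FBP/GPC/TPC/ZCULL/PES counts and
      * masks) from priv registers and derive per-GPC skip masks and CB
      * defaults.
      */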
2955 static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
2956 {
2957         u32 gpc_index, pes_index;
2958         u32 pes_tpc_mask;
2959         u32 pes_tpc_count;
2960         u32 pes_heavy_index;
2961         u32 gpc_new_skip_mask;
2962         u32 tmp;
2963
2964         tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
2965         gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
2966
2967         tmp = gk20a_readl(g, top_num_gpcs_r());
2968         gr->max_gpc_count = top_num_gpcs_value_v(tmp);
2969
2970         tmp = gk20a_readl(g, top_num_fbps_r());
2971         gr->max_fbps_count = top_num_fbps_value_v(tmp);
2972
2973         tmp = gk20a_readl(g, top_tpc_per_gpc_r());
2974         gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
2975
2976         gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
2977
2978         tmp = gk20a_readl(g, top_num_fbps_r());
2979         gr->sys_count = top_num_fbps_value_v(tmp);
2980
2981         tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
2982         gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
2983
2984         gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
2985         gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
2986
2987         if (!gr->gpc_count) {
2988                 gk20a_err(dev_from_gk20a(g), "gpc_count==0!");
2989                 goto clean_up;
2990         }
2991
2992         gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2993         gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2994         gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2995         gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2996         gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2997         gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2998         gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2999         gr->gpc_skip_mask =
3000                 kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
3001                         GFP_KERNEL);
3002
3003         if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
3004             !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
3005             !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
3006                 goto clean_up;
3007
3008         gr->ppc_count = 0;
3009         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3010                 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
3011
3012                 gr->gpc_tpc_count[gpc_index] =
3013                         gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
3014                 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
3015
3016                 gr->gpc_zcb_count[gpc_index] =
3017                         gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
3018                 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
3019
3020                 gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
3021                 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
3022                 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
3023
3024                         tmp = gk20a_readl(g,
3025                                 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
3026                                 gpc_index * proj_gpc_stride_v());
3027
3028                         pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
3029                         pes_tpc_count = count_bits(pes_tpc_mask);
3030
3031                         gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
3032                         gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
3033                 }
3034
3035                 gpc_new_skip_mask = 0;
3036                 if (gr->pes_tpc_count[0][gpc_index] +
3037                     gr->pes_tpc_count[1][gpc_index] == 5) {
3038                         pes_heavy_index =
3039                                 gr->pes_tpc_count[0][gpc_index] >
3040                                 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3041
3042                         gpc_new_skip_mask =
3043                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3044                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3045                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3046
3047                 } else if ((gr->pes_tpc_count[0][gpc_index] +
3048                             gr->pes_tpc_count[1][gpc_index] == 4) &&
3049                            (gr->pes_tpc_count[0][gpc_index] !=
3050                             gr->pes_tpc_count[1][gpc_index])) {
3051                                 pes_heavy_index =
3052                                     gr->pes_tpc_count[0][gpc_index] >
3053                                     gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3054
3055                         gpc_new_skip_mask =
3056                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3057                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3058                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3059                 }
3060                 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
3061         }
3062
3063         gk20a_dbg_info("fbps: %d", gr->num_fbps);
3064         gk20a_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
3065         gk20a_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
3066         gk20a_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
3067         gk20a_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
3068         gk20a_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
3069         gk20a_dbg_info("sys_count: %d", gr->sys_count);
3070         gk20a_dbg_info("gpc_count: %d", gr->gpc_count);
3071         gk20a_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
3072         gk20a_dbg_info("tpc_count: %d", gr->tpc_count);
3073         gk20a_dbg_info("ppc_count: %d", gr->ppc_count);
3074
3075         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3076                 gk20a_dbg_info("gpc_tpc_count[%d] : %d",
3077                            gpc_index, gr->gpc_tpc_count[gpc_index]);
3078         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3079                 gk20a_dbg_info("gpc_zcb_count[%d] : %d",
3080                            gpc_index, gr->gpc_zcb_count[gpc_index]);
3081         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3082                 gk20a_dbg_info("gpc_ppc_count[%d] : %d",
3083                            gpc_index, gr->gpc_ppc_count[gpc_index]);
3084         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3085                 gk20a_dbg_info("gpc_skip_mask[%d] : %d",
3086                            gpc_index, gr->gpc_skip_mask[gpc_index]);
3087         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3088                 for (pes_index = 0;
3089                      pes_index < gr->pe_count_per_gpc;
3090                      pes_index++)
3091                         gk20a_dbg_info("pes_tpc_count[%d][%d] : %d",
3092                                    pes_index, gpc_index,
3093                                    gr->pes_tpc_count[pes_index][gpc_index]);
3094
3095         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3096                 for (pes_index = 0;
3097                      pes_index < gr->pe_count_per_gpc;
3098                      pes_index++)
3099                         gk20a_dbg_info("pes_tpc_mask[%d][%d] : %d",
3100                                    pes_index, gpc_index,
3101                                    gr->pes_tpc_mask[pes_index][gpc_index]);
3102
3103         g->ops.gr.bundle_cb_defaults(g);
3104         g->ops.gr.cb_size_default(g);
3105         g->ops.gr.calc_global_ctx_buffer_size(g);
3106         gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
3107
3108         gk20a_dbg_info("bundle_cb_default_size: %d",
3109                    gr->bundle_cb_default_size);
3110         gk20a_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
3111         gk20a_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
3112         gk20a_dbg_info("attrib_cb_default_size: %d",
3113                    gr->attrib_cb_default_size);
3114         gk20a_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
3115         gk20a_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
3116         gk20a_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
3117         gk20a_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
3118
3119         return 0;
3120
3121 clean_up:
3122         return -ENOMEM;
3123 }
3124
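     /* Allocate the 4 KB coherent mmu_wr_mem and mmu_rd_mem buffers. */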
3125 static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
3126 {
3127         struct device *d = dev_from_gk20a(g);
3128         dma_addr_t iova;
3129
3130         gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
3131
3132         gr->mmu_wr_mem.size = gr->mmu_wr_mem_size;
3133         gr->mmu_wr_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_wr_mem_size,
3134                                         &iova, GFP_KERNEL);
3135         if (!gr->mmu_wr_mem.cpuva)
3136                 goto err;
3137
3138         gr->mmu_wr_mem.iova = iova;
3139
3140         gr->mmu_rd_mem.size = gr->mmu_rd_mem_size;
3141         gr->mmu_rd_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_rd_mem_size,
3142                                         &iova, GFP_KERNEL);
3143         if (!gr->mmu_rd_mem.cpuva)
3144                 goto err_free_wr_mem;
3145
3146         gr->mmu_rd_mem.iova = iova;
3147         return 0;
3148
3149  err_free_wr_mem:
3150         dma_free_coherent(d, gr->mmu_wr_mem.size,
3151                 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
3152         gr->mmu_wr_mem.cpuva = NULL;
3153         gr->mmu_wr_mem.iova = 0;
3154  err:
3155         return -ENOMEM;
3156 }
3157
3158 static u32 prime_set[18] = {
3159         2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3160
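     /*
      * Build the screen-tile to GPC mapping: choose a row offset that does not
      * divide the TPC count, then distribute tiles across GPCs in proportion
      * to their TPC counts using an error-accumulation scheme.
      */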
3161 static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3162 {
3163         s32 comm_denom;
3164         s32 mul_factor;
3165         s32 *init_frac = NULL;
3166         s32 *init_err = NULL;
3167         s32 *run_err = NULL;
3168         s32 *sorted_num_tpcs = NULL;
3169         s32 *sorted_to_unsorted_gpc_map = NULL;
3170         u32 gpc_index;
3171         u32 gpc_mark = 0;
3172         u32 num_tpc;
3173         u32 max_tpc_count = 0;
3174         u32 swap;
3175         u32 tile_count;
3176         u32 index;
3177         bool delete_map = false;
3178         bool gpc_sorted;
3179         int ret = 0;
3180
3181         init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3182         init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3183         run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3184         sorted_num_tpcs =
3185                 kzalloc(proj_scal_max_gpcs_v() *
3186                         proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
3187                         GFP_KERNEL);
3188         sorted_to_unsorted_gpc_map =
3189                 kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3190
3191         if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
3192               sorted_to_unsorted_gpc_map)) {
3193                 ret = -ENOMEM;
3194                 goto clean_up;
3195         }
3196
3197         gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3198
3199         if (gr->tpc_count == 3)
3200                 gr->map_row_offset = 2;
3201         else if (gr->tpc_count < 3)
3202                 gr->map_row_offset = 1;
3203         else {
3204                 gr->map_row_offset = 3;
3205
3206                 for (index = 1; index < 18; index++) {
3207                         u32 prime = prime_set[index];
3208                         if ((gr->tpc_count % prime) != 0) {
3209                                 gr->map_row_offset = prime;
3210                                 break;
3211                         }
3212                 }
3213         }
3214
3215         switch (gr->tpc_count) {
3216         case 15:
3217                 gr->map_row_offset = 6;
3218                 break;
3219         case 14:
3220                 gr->map_row_offset = 5;
3221                 break;
3222         case 13:
3223                 gr->map_row_offset = 2;
3224                 break;
3225         case 11:
3226                 gr->map_row_offset = 7;
3227                 break;
3228         case 10:
3229                 gr->map_row_offset = 6;
3230                 break;
3231         case 7:
3232         case 5:
3233                 gr->map_row_offset = 1;
3234                 break;
3235         default:
3236                 break;
3237         }
3238
3239         if (gr->map_tiles) {
3240                 if (gr->map_tile_count != gr->tpc_count)
3241                         delete_map = true;
3242
3243                 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3244                         if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
3245                                 delete_map = true;
3246                 }
3247
3248                 if (delete_map) {
3249                         kfree(gr->map_tiles);
3250                         gr->map_tiles = NULL;
3251                         gr->map_tile_count = 0;
3252                 }
3253         }
3254
3255         if (gr->map_tiles == NULL) {
3256                 gr->map_tile_count = proj_scal_max_gpcs_v();
3257
3258                 gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
3259                 if (gr->map_tiles == NULL) {
3260                         ret = -ENOMEM;
3261                         goto clean_up;
3262                 }
3263
3264                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3265                         sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3266                         sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3267                 }
3268
3269                 gpc_sorted = false;
3270                 while (!gpc_sorted) {
3271                         gpc_sorted = true;
3272                         for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3273                                 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3274                                         gpc_sorted = false;
3275                                         swap = sorted_num_tpcs[gpc_index];
3276                                         sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3277                                         sorted_num_tpcs[gpc_index + 1] = swap;
3278                                         swap = sorted_to_unsorted_gpc_map[gpc_index];
3279                                         sorted_to_unsorted_gpc_map[gpc_index] =
3280                                                 sorted_to_unsorted_gpc_map[gpc_index + 1];
3281                                         sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3282                                 }
3283                         }
3284                 }
3285
3286                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3287                         if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
3288                                 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3289
3290                 mul_factor = gr->gpc_count * max_tpc_count;
3291                 if (mul_factor & 0x1)
3292                         mul_factor = 2;
3293                 else
3294                         mul_factor = 1;
3295
3296                 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3297
3298                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3299                         num_tpc = sorted_num_tpcs[gpc_index];
3300
3301                         init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3302
3303                         if (num_tpc != 0)
3304                                 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3305                         else
3306                                 init_err[gpc_index] = 0;
3307
3308                         run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3309                 }
3310
3311                 while (gpc_mark < gr->tpc_count) {
3312                         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3313                                 if ((run_err[gpc_index] * 2) >= comm_denom) {
3314                                         gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3315                                         run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3316                                 } else
3317                                         run_err[gpc_index] += init_frac[gpc_index];
3318                         }
3319                 }
3320         }
3321
3322 clean_up:
3323         kfree(init_frac);
3324         kfree(init_err);
3325         kfree(run_err);
3326         kfree(sorted_num_tpcs);
3327         kfree(sorted_to_unsorted_gpc_map);
3328
3329         if (ret)
3330                 gk20a_err(dev_from_gk20a(g), "fail");
3331         else
3332                 gk20a_dbg_fn("done");
3333
3334         return ret;
3335 }
3336
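     /* Derive zcull aliquot geometry and totals from the TPC and zcull bank counts. */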
3337 static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3338 {
3339         struct gr_zcull_gk20a *zcull = &gr->zcull;
3340
3341         zcull->aliquot_width = gr->tpc_count * 16;
3342         zcull->aliquot_height = 16;
3343
3344         zcull->width_align_pixels = gr->tpc_count * 16;
3345         zcull->height_align_pixels = 32;
3346
3347         zcull->aliquot_size =
3348                 zcull->aliquot_width * zcull->aliquot_height;
3349
3350         /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3351         zcull->pixel_squares_by_aliquots =
3352                 gr->zcb_count * 16 * 16 * gr->tpc_count /
3353                 (gr->gpc_count * gr->gpc_tpc_count[0]);
3354
3355         zcull->total_aliquots =
3356                 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3357                         gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3358
3359         return 0;
3360 }
3361
3362 u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3363 {
3364         /* assuming gr has already been initialized */
3365         return gr->ctx_vars.zcull_ctxsw_image_size;
3366 }
3367
3368 int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3369                         struct channel_gk20a *c, u64 zcull_va, u32 mode)
3370 {
3371         struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
3372
3373         zcull_ctx->ctx_sw_mode = mode;
3374         zcull_ctx->gpu_va = zcull_va;
3375
3376         /* TBD: don't disable channel in sw method processing */
3377         return gr_gk20a_ctx_zcull_setup(g, c, true);
3378 }
3379
3380 int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3381                         struct gr_zcull_info *zcull_params)
3382 {
3383         struct gr_zcull_gk20a *zcull = &gr->zcull;
3384
3385         zcull_params->width_align_pixels = zcull->width_align_pixels;
3386         zcull_params->height_align_pixels = zcull->height_align_pixels;
3387         zcull_params->pixel_squares_by_aliquots =
3388                 zcull->pixel_squares_by_aliquots;
3389         zcull_params->aliquot_total = zcull->total_aliquots;
3390
3391         zcull_params->region_byte_multiplier =
3392                 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3393         zcull_params->region_header_size =
3394                 proj_scal_litter_num_gpcs_v() *
3395                 gr_zcull_save_restore_header_bytes_per_gpc_v();
3396
3397         zcull_params->subregion_header_size =
3398                 proj_scal_litter_num_gpcs_v() *
3399                 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3400
3401         zcull_params->subregion_width_align_pixels =
3402                 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3403         zcull_params->subregion_height_align_pixels =
3404                 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3405         zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3406
3407         return 0;
3408 }
3409
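     /*
      * Program one ZBC color entry: quiesce the GR engine, update the L2 and
      * DS tables at the given index, then mirror the values into the SW table.
      */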
3410 static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3411                                   struct zbc_entry *color_val, u32 index)
3412 {
3413         struct fifo_gk20a *f = &g->fifo;
3414         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3415         u32 i;
3416         unsigned long end_jiffies = jiffies +
3417                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3418         u32 ret;
3419
3420         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3421         if (ret) {
3422                 gk20a_err(dev_from_gk20a(g),
3423                         "failed to disable gr engine activity\n");
3424                 return ret;
3425         }
3426
3427         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3428         if (ret) {
3429                 gk20a_err(dev_from_gk20a(g),
3430                         "failed to idle graphics\n");
3431                 goto clean_up;
3432         }
3433
3434         /* update l2 table */
3435         g->ops.ltc.set_zbc_color_entry(g, color_val, index);
3436
3437         /* update ds table */
3438         gk20a_writel(g, gr_ds_zbc_color_r_r(),
3439                 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3440         gk20a_writel(g, gr_ds_zbc_color_g_r(),
3441                 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3442         gk20a_writel(g, gr_ds_zbc_color_b_r(),
3443                 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3444         gk20a_writel(g, gr_ds_zbc_color_a_r(),
3445                 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3446
3447         gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3448                 gr_ds_zbc_color_fmt_val_f(color_val->format));
3449
3450         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3451                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3452
3453         /* trigger the write */
3454         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3455                 gr_ds_zbc_tbl_ld_select_c_f() |
3456                 gr_ds_zbc_tbl_ld_action_write_f() |
3457                 gr_ds_zbc_tbl_ld_trigger_active_f());
3458
3459         /* update local copy */
3460         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3461                 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3462                 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3463         }
3464         gr->zbc_col_tbl[index].format = color_val->format;
3465         gr->zbc_col_tbl[index].ref_cnt++;
3466
3467 clean_up:
3468         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3469         if (ret) {
3470                 gk20a_err(dev_from_gk20a(g),
3471                         "failed to enable gr engine activity\n");
3472         }
3473
3474         return ret;
3475 }
3476
3477 static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3478                                 struct zbc_entry *depth_val, u32 index)
3479 {
3480         struct fifo_gk20a *f = &g->fifo;
3481         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3482         unsigned long end_jiffies = jiffies +
3483                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3484         u32 ret;
3485
3486         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3487         if (ret) {
3488                 gk20a_err(dev_from_gk20a(g),
3489                         "failed to disable gr engine activity\n");
3490                 return ret;
3491         }
3492
3493         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3494         if (ret) {
3495                 gk20a_err(dev_from_gk20a(g),
3496                         "failed to idle graphics\n");
3497                 goto clean_up;
3498         }
3499
3500         /* update l2 table */
3501         g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);
3502
3503         /* update ds table */
3504         gk20a_writel(g, gr_ds_zbc_z_r(),
3505                 gr_ds_zbc_z_val_f(depth_val->depth));
3506
3507         gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3508                 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3509
3510         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3511                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3512
3513         /* trigger the write */
3514         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3515                 gr_ds_zbc_tbl_ld_select_z_f() |
3516                 gr_ds_zbc_tbl_ld_action_write_f() |
3517                 gr_ds_zbc_tbl_ld_trigger_active_f());
3518
3519         /* update local copy */
3520         gr->zbc_dep_tbl[index].depth = depth_val->depth;
3521         gr->zbc_dep_tbl[index].format = depth_val->format;
3522         gr->zbc_dep_tbl[index].ref_cnt++;
3523
3524 clean_up:
3525         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3526         if (ret) {
3527                 gk20a_err(dev_from_gk20a(g),
3528                         "failed to enable gr engine activity\n");
3529         }
3530
3531         return ret;
3532 }
3533
3534 void gr_gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
3535 {
3536         struct fifo_gk20a *f = &g->fifo;
3537         struct fifo_engine_info_gk20a *gr_info =
3538                 f->engine_info + ENGINE_GR_GK20A;
3539         unsigned long end_jiffies = jiffies +
3540                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3541         u32 ret;
3542
3543         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3544         if (ret) {
3545                 gk20a_err(dev_from_gk20a(g),
3546                         "failed to disable gr engine activity\n");
3547                 return;
3548         }
3549
3550         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3551         if (ret) {
3552                 gk20a_err(dev_from_gk20a(g),
3553                         "failed to idle graphics\n");
3554                 goto clean_up;
3555         }
3556
3557         /* update zbc */
3558         gk20a_pmu_save_zbc(g, entries);
3559
3560 clean_up:
3561         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3562         if (ret) {
3563                 gk20a_err(dev_from_gk20a(g),
3564                         "failed to enable gr engine activity\n");
3565         }
3566
3567         return;
3568 }
3569
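     /*
      * Add (or reference-count) a ZBC color/depth clear value: reuse a
      * matching entry if one exists, otherwise program the next free slot and
      * have the PMU save the updated table for ELPG.
      */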
3570 int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3571                      struct zbc_entry *zbc_val)
3572 {
3573         struct zbc_color_table *c_tbl;
3574         struct zbc_depth_table *d_tbl;
3575         u32 i, ret = -ENOMEM;
3576         bool added = false;
3577         u32 entries;
3578
3579         /* no endian swap ? */
3580
3581         mutex_lock(&gr->zbc_lock);
3582         switch (zbc_val->type) {
3583         case GK20A_ZBC_TYPE_COLOR:
3584                 /* search existing tables */
3585                 for (i = 0; i < gr->max_used_color_index; i++) {
3586
3587                         c_tbl = &gr->zbc_col_tbl[i];
3588
3589                         if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
3590                             memcmp(c_tbl->color_ds, zbc_val->color_ds,
3591                                 sizeof(zbc_val->color_ds)) == 0) {
3592
3593                                 if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3594                                     sizeof(zbc_val->color_l2))) {
3595                                         gk20a_err(dev_from_gk20a(g),
3596                                                 "zbc l2 and ds color don't match with existing entries");
3597                                         ret = -EINVAL;
3598                                         goto err_mutex;
3599                                 }
3600                                 added = true;
3601                                 c_tbl->ref_cnt++;
3602                                 ret = 0;
3603                                 break;
3604                         }
3605                 }
3606                 /* add new table */
3607                 if (!added &&
3608                     gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3609
3610                         c_tbl =
3611                             &gr->zbc_col_tbl[gr->max_used_color_index];
3612                         WARN_ON(c_tbl->ref_cnt != 0);
3613
3614                         ret = gr_gk20a_add_zbc_color(g, gr,
3615                                 zbc_val, gr->max_used_color_index);
3616
3617                         if (!ret)
3618                                 gr->max_used_color_index++;
3619                 }
3620                 break;
3621         case GK20A_ZBC_TYPE_DEPTH:
3622                 /* search existing tables */
3623                 for (i = 0; i < gr->max_used_depth_index; i++) {
3624
3625                         d_tbl = &gr->zbc_dep_tbl[i];
3626
3627                         if (d_tbl->ref_cnt &&
3628                             d_tbl->depth == zbc_val->depth &&
3629                             d_tbl->format == zbc_val->format) {
3630                                 added = true;
3631                                 d_tbl->ref_cnt++;
3632                                 ret = 0;
3633                                 break;
3634                         }
3635                 }
3636                 /* add new table */
3637                 if (!added &&
3638                     gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
3639
3640                         d_tbl =
3641                             &gr->zbc_dep_tbl[gr->max_used_depth_index];
3642                         WARN_ON(d_tbl->ref_cnt != 0);
3643
3644                         ret = gr_gk20a_add_zbc_depth(g, gr,
3645                                 zbc_val, gr->max_used_depth_index);
3646
3647                         if (!ret)
3648                                 gr->max_used_depth_index++;
3649                 }
3650                 break;
3651         default:
3652                 gk20a_err(dev_from_gk20a(g),
3653                         "invalid zbc table type %d", zbc_val->type);
3654                 ret = -EINVAL;
3655                 goto err_mutex;
3656         }
3657
3658         if (!added && ret == 0) {
3659                 /* update zbc for elpg only when new entry is added */
3660                 entries = max(gr->max_used_color_index,
3661                                         gr->max_used_depth_index);
3662                 gr_gk20a_pmu_save_zbc(g, entries);
3663         }
3664
3665 err_mutex:
3666         mutex_unlock(&gr->zbc_lock);
3667         return ret;
3668 }
3669
3670 /* get a zbc table entry specified by index
3671  * return table size when type is invalid */
3672 int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
3673                         struct zbc_query_params *query_params)
3674 {
3675         u32 index = query_params->index_size;
3676         u32 i;
3677
3678         switch (query_params->type) {
3679         case GK20A_ZBC_TYPE_INVALID:
3680                 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
3681                 break;
3682         case GK20A_ZBC_TYPE_COLOR:
3683                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3684                         gk20a_err(dev_from_gk20a(g),
3685                                 "invalid zbc color table index\n");
3686                         return -EINVAL;
3687                 }
3688                 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3689                         query_params->color_l2[i] =
3690                                 gr->zbc_col_tbl[index].color_l2[i];
3691                         query_params->color_ds[i] =
3692                                 gr->zbc_col_tbl[index].color_ds[i];
3693                 }
3694                 query_params->format = gr->zbc_col_tbl[index].format;
3695                 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
3696                 break;
3697         case GK20A_ZBC_TYPE_DEPTH:
3698                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3699                         gk20a_err(dev_from_gk20a(g),
3700                                 "invalid zbc depth table index\n");
3701                         return -EINVAL;
3702                 }
3703                 query_params->depth = gr->zbc_dep_tbl[index].depth;
3704                 query_params->format = gr->zbc_dep_tbl[index].format;
3705                 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
3706                 break;
3707         default:
3708                 gk20a_err(dev_from_gk20a(g),
3709                                 "invalid zbc table type\n");
3710                 return -EINVAL;
3711         }
3712
3713         return 0;
3714 }
3715
3716 int gr_gk20a_load_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
3717 {
3718         int i, ret;
3719
3720         mutex_init(&gr->zbc_lock);
3721         for (i = 0; i < gr->max_used_color_index; i++) {
3722                 struct zbc_color_table *c_tbl = &gr->zbc_col_tbl[i];
3723                 struct zbc_entry zbc_val;
3724
3725                 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3726                 memcpy(zbc_val.color_ds,
3727                        c_tbl->color_ds, sizeof(zbc_val.color_ds));
3728                 memcpy(zbc_val.color_l2,
3729                        c_tbl->color_l2, sizeof(zbc_val.color_l2));
3730                 zbc_val.format = c_tbl->format;
3731
3732                 ret = gr_gk20a_add_zbc_color(g, gr, &zbc_val, i);
3733
3734                 if (ret)
3735                         return ret;
3736         }
3737         for (i = 0; i < gr->max_used_depth_index; i++) {
3738                 struct zbc_depth_table *d_tbl = &gr->zbc_dep_tbl[i];
3739                 struct zbc_entry zbc_val;
3740
3741                 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3742                 zbc_val.depth = d_tbl->depth;
3743                 zbc_val.format = d_tbl->format;
3744
3745                 ret = gr_gk20a_add_zbc_depth(g, gr, &zbc_val, i);
3746                 if (ret)
3747                         return ret;
3748         }
3749         return 0;
3750 }
3751
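     /* Seed the ZBC tables with default color (4 entries) and depth (2 entries) values. */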
3752 int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
3753 {
3754         struct zbc_entry zbc_val;
3755         u32 i, err;
3756
3757         /* load default color table */
3758         zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3759
3760         zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
3761         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3762                 zbc_val.color_ds[i] = 0;
3763                 zbc_val.color_l2[i] = 0;
3764         }
3765         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3766
3767         zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
3768         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3769                 zbc_val.color_ds[i] = 0xffffffff;
3770                 zbc_val.color_l2[i] = 0x3f800000;
3771         }
3772         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3773
3774         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3775         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3776                 zbc_val.color_ds[i] = 0;
3777                 zbc_val.color_l2[i] = 0;
3778         }
3779         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3780
3781         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3782         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3783                 zbc_val.color_ds[i] = 0x3f800000;
3784                 zbc_val.color_l2[i] = 0x3f800000;
3785         }
3786         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3787
3788         if (!err)
3789                 gr->max_default_color_index = 4;
3790         else {
3791                 gk20a_err(dev_from_gk20a(g),
3792                            "fail to load default zbc color table\n");
3793                 return err;
3794         }
3795
3796         /* load default depth table */
3797         zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3798
3799         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3800         zbc_val.depth = 0;
3801         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3802
3803         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3804         zbc_val.depth = 0x3f800000;
3805         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3806
3807         if (!err)
3808                 gr->max_default_depth_index = 2;
3809         else {
3810                 gk20a_err(dev_from_gk20a(g),
3811                            "fail to load default zbc depth table\n");
3812                 return err;
3813         }
3814
3815         return 0;
3816 }
3817
3818 int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
3819                         struct zbc_entry *zbc_val)
3820 {
3821         gk20a_dbg_fn("");
3822
3823         return gr_gk20a_elpg_protected_call(g,
3824                 gr_gk20a_add_zbc(g, gr, zbc_val));
3825 }
3826
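     /* Set the block-level clock gating (BLCG) mode for the given engine. */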
3827 void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
3828 {
3829         u32 gate_ctrl;
3830
3831         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3832
3833         switch (mode) {
3834         case BLCG_RUN:
3835                 gate_ctrl = set_field(gate_ctrl,
3836                                 therm_gate_ctrl_blk_clk_m(),
3837                                 therm_gate_ctrl_blk_clk_run_f());
3838                 break;
3839         case BLCG_AUTO:
3840                 gate_ctrl = set_field(gate_ctrl,
3841                                 therm_gate_ctrl_blk_clk_m(),
3842                                 therm_gate_ctrl_blk_clk_auto_f());
3843                 break;
3844         default:
3845                 gk20a_err(dev_from_gk20a(g),
3846                         "invalid blcg mode %d", mode);
3847                 return;
3848         }
3849
3850         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3851 }
3852
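     /*
      * Set the engine-level clock gating (ELCG) mode for the given engine and
      * program default idle filters.
      */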
3853 void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
3854 {
3855         u32 gate_ctrl, idle_filter;
3856
3857         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3858
3859         switch (mode) {
3860         case ELCG_RUN:
3861                 gate_ctrl = set_field(gate_ctrl,
3862                                 therm_gate_ctrl_eng_clk_m(),
3863                                 therm_gate_ctrl_eng_clk_run_f());
3864                 gate_ctrl = set_field(gate_ctrl,
3865                                 therm_gate_ctrl_eng_pwr_m(),
3866                                 /* set elpg to auto to meet hw expectation */
3867                                 therm_gate_ctrl_eng_pwr_auto_f());
3868                 break;
3869         case ELCG_STOP:
3870                 gate_ctrl = set_field(gate_ctrl,
3871                                 therm_gate_ctrl_eng_clk_m(),
3872                                 therm_gate_ctrl_eng_clk_stop_f());
3873                 break;
3874         case ELCG_AUTO:
3875                 gate_ctrl = set_field(gate_ctrl,
3876                                 therm_gate_ctrl_eng_clk_m(),
3877                                 therm_gate_ctrl_eng_clk_auto_f());
3878                 break;
3879         default:
3880                 gk20a_err(dev_from_gk20a(g),
3881                         "invalid elcg mode %d", mode);
3882         }
3883
3884         if (tegra_platform_is_linsim()) {
3885                 gate_ctrl = set_field(gate_ctrl,
3886                         therm_gate_ctrl_eng_delay_after_m(),
3887                         therm_gate_ctrl_eng_delay_after_f(4));
3888         }
3889
3890         /* 2 * (1 << 9) = 1024 clks */
3891         gate_ctrl = set_field(gate_ctrl,
3892                 therm_gate_ctrl_eng_idle_filt_exp_m(),
3893                 therm_gate_ctrl_eng_idle_filt_exp_f(9));
3894         gate_ctrl = set_field(gate_ctrl,
3895                 therm_gate_ctrl_eng_idle_filt_mant_m(),
3896                 therm_gate_ctrl_eng_idle_filt_mant_f(2));
3897         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3898
3899         /* default fecs_idle_filter to 0 */
3900         idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
3901         idle_filter &= ~therm_fecs_idle_filter_value_m();
3902         gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
3903         /* default hubmmu_idle_filter to 0 */
3904         idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
3905         idle_filter &= ~therm_hubmmu_idle_filter_value_m();
3906         gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
3907 }
3908
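     /*
      * Program the zcull hardware from the SW state: write the SM-in-GPC
      * number maps derived from map_tiles and check zcull bank counts against
      * TPC counts per GPC.
      */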
3909 static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
3910 {
3911         u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
3912         u32 *zcull_map_tiles, *zcull_bank_counters;
3913         u32 map_counter;
3914         u32 rcp_conserv;
3915         u32 offset;
3916         bool floorsweep = false;
3917
3918         if (!gr->map_tiles)
3919                 return -1;
3920
3921         zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
3922                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3923         if (!zcull_map_tiles) {
3924                 gk20a_err(dev_from_gk20a(g),
3925                         "failed to allocate zcull temp buffers");
3926                 return -ENOMEM;
3927         }
3928         zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
3929                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3930
3931         if (!zcull_bank_counters) {
3932                 gk20a_err(dev_from_gk20a(g),
3933                         "failed to allocate zcull temp buffers");
3934                 kfree(zcull_map_tiles);
3935                 return -ENOMEM;
3936         }
3937
3938         for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
3939                 zcull_map_tiles[map_counter] =
3940                         zcull_bank_counters[gr->map_tiles[map_counter]];
3941                 zcull_bank_counters[gr->map_tiles[map_counter]]++;
3942         }
3943
3944         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
3945                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
3946                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
3947                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
3948                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
3949                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
3950                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
3951                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
3952                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));
3953
3954         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
3955                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
3956                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
3957                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
3958                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
3959                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
3960                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
3961                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
3962                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));
3963
3964         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
3965                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
3966                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
3967                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
3968                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
3969                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
3970                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
3971                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
3972                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));
3973
3974         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
3975                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
3976                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
3977                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
3978                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
3979                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
3980                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
3981                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
3982                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));
3983
3984         kfree(zcull_map_tiles);
3985         kfree(zcull_bank_counters);
3986
3987         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3988                 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
3989                 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
3990
3991                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3992                     gpc_zcull_count < gpc_tpc_count) {
3993                         gk20a_err(dev_from_gk20a(g),
3994                                 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
3995                                 gpc_zcull_count, gpc_tpc_count, gpc_index);
3996                         return -EINVAL;
3997                 }
3998                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3999                     gpc_zcull_count != 0)
4000                         floorsweep = true;
4001         }
4002
4003         /* 1.0f / 1.0f * gr_gpc0_zcull_sm_num_rcp_conservative__max_v() */
4004         rcp_conserv = gr_gpc0_zcull_sm_num_rcp_conservative__max_v();
4005
4006         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4007                 offset = gpc_index * proj_gpc_stride_v();
4008
4009                 if (floorsweep) {
4010                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4011                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4012                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4013                                         gr->max_zcull_per_gpc_count));
4014                 } else {
4015                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4016                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4017                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4018                                         gr->gpc_tpc_count[gpc_index]));
4019                 }
4020
4021                 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4022                         gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4023                         gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4024
4025                 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4026                         gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4027         }
4028
4029         gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4030                 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4031
4032         return 0;
4033 }
4034
4035 static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4036 {
4037         /* enable tpc exception forwarding */
4038         gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
4039                 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f());
4040
4041         /* enable gpc exception forwarding */
4042         gk20a_writel(g, gr_gpc0_gpccs_gpc_exception_en_r(),
4043                 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f());
4044 }
4045
4046
4047 void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
4048 {
4049         /* enable exceptions */
4050         gk20a_writel(g, gr_fe_hww_esr_r(),
4051                      gr_fe_hww_esr_en_enable_f() |
4052                      gr_fe_hww_esr_reset_active_f());
4053         gk20a_writel(g, gr_memfmt_hww_esr_r(),
4054                      gr_memfmt_hww_esr_en_enable_f() |
4055                      gr_memfmt_hww_esr_reset_active_f());
4056         gk20a_writel(g, gr_scc_hww_esr_r(),
4057                      gr_scc_hww_esr_en_enable_f() |
4058                      gr_scc_hww_esr_reset_active_f());
4059         gk20a_writel(g, gr_mme_hww_esr_r(),
4060                      gr_mme_hww_esr_en_enable_f() |
4061                      gr_mme_hww_esr_reset_active_f());
4062         gk20a_writel(g, gr_pd_hww_esr_r(),
4063                      gr_pd_hww_esr_en_enable_f() |
4064                      gr_pd_hww_esr_reset_active_f());
4065         gk20a_writel(g, gr_sked_hww_esr_r(), /* enabled by default */
4066                      gr_sked_hww_esr_reset_active_f());
4067         gk20a_writel(g, gr_ds_hww_esr_r(),
4068                      gr_ds_hww_esr_en_enabled_f() |
4069                      gr_ds_hww_esr_reset_task_f());
4070         gk20a_writel(g, gr_ds_hww_report_mask_r(),
4071                      gr_ds_hww_report_mask_sph0_err_report_f() |
4072                      gr_ds_hww_report_mask_sph1_err_report_f() |
4073                      gr_ds_hww_report_mask_sph2_err_report_f() |
4074                      gr_ds_hww_report_mask_sph3_err_report_f() |
4075                      gr_ds_hww_report_mask_sph4_err_report_f() |
4076                      gr_ds_hww_report_mask_sph5_err_report_f() |
4077                      gr_ds_hww_report_mask_sph6_err_report_f() |
4078                      gr_ds_hww_report_mask_sph7_err_report_f() |
4079                      gr_ds_hww_report_mask_sph8_err_report_f() |
4080                      gr_ds_hww_report_mask_sph9_err_report_f() |
4081                      gr_ds_hww_report_mask_sph10_err_report_f() |
4082                      gr_ds_hww_report_mask_sph11_err_report_f() |
4083                      gr_ds_hww_report_mask_sph12_err_report_f() |
4084                      gr_ds_hww_report_mask_sph13_err_report_f() |
4085                      gr_ds_hww_report_mask_sph14_err_report_f() |
4086                      gr_ds_hww_report_mask_sph15_err_report_f() |
4087                      gr_ds_hww_report_mask_sph16_err_report_f() |
4088                      gr_ds_hww_report_mask_sph17_err_report_f() |
4089                      gr_ds_hww_report_mask_sph18_err_report_f() |
4090                      gr_ds_hww_report_mask_sph19_err_report_f() |
4091                      gr_ds_hww_report_mask_sph20_err_report_f() |
4092                      gr_ds_hww_report_mask_sph21_err_report_f() |
4093                      gr_ds_hww_report_mask_sph22_err_report_f() |
4094                      gr_ds_hww_report_mask_sph23_err_report_f());
4095 }
4096
4097 static void gr_gk20a_set_hww_esr_report_mask(struct gk20a *g)
4098 {
4099         /* setup sm warp esr report masks */
4100         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4101                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4102                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4103                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4104                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4105                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4106                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4107                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4108                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4109                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4110                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4111                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4112                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4113                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4114                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4115                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4116                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4117                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4118                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4119                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4120                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4121
4122         /* setup sm global esr report mask */
4123         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4124                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4125                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4126                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4127                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4128                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4129                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4130                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4131 }
4132
4133 static int gk20a_init_gr_setup_hw(struct gk20a *g)
4134 {
4135         struct gr_gk20a *gr = &g->gr;
4136         struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4137         struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4138         u32 data;
4139         u32 addr_lo, addr_hi;
4140         u64 addr;
4141         unsigned long end_jiffies = jiffies +
4142                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4143         u32 fe_go_idle_timeout_save;
4144         u32 last_method_data = 0;
4145         u32 i, err;
4146
4147         gk20a_dbg_fn("");
4148
4149         /* slcg prod values */
4150         g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled);
4151         g->ops.clock_gating.slcg_perf_load_gating_prod(g, g->slcg_enabled);
4152
4153         /* init mmu debug buffer */
4154         addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_wr_mem.iova);
4155         addr_lo = u64_lo32(addr);
4156         addr_hi = u64_hi32(addr);
4157         addr = (addr_lo >> fb_mmu_debug_wr_addr_alignment_v()) |
4158                 (addr_hi << (32 - fb_mmu_debug_wr_addr_alignment_v()));
4159
4160         gk20a_writel(g, fb_mmu_debug_wr_r(),
4161                      fb_mmu_debug_wr_aperture_vid_mem_f() |
4162                      fb_mmu_debug_wr_vol_false_f() |
4163                      fb_mmu_debug_wr_addr_v(addr));
4164
4165         addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_rd_mem.iova);
4166         addr_lo = u64_lo32(addr);
4167         addr_hi = u64_hi32(addr);
4168         addr = (addr_lo >> fb_mmu_debug_rd_addr_alignment_v()) |
4169                 (addr_hi << (32 - fb_mmu_debug_rd_addr_alignment_v()));
4170
4171         gk20a_writel(g, fb_mmu_debug_rd_r(),
4172                      fb_mmu_debug_rd_aperture_vid_mem_f() |
4173                      fb_mmu_debug_rd_vol_false_f() |
4174                      fb_mmu_debug_rd_addr_v(addr));
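             /*
              * The debug buffer address is packed as a single right-shifted
              * field: (addr_lo >> align) | (addr_hi << (32 - align)).  As a
              * sketch, assuming align is 12 (4 KiB alignment), a buffer at
              * physical 0x1_2345_6000 would be programmed as
              * (0x23456000 >> 12) | (0x1 << 20) = 0x123456.
              */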
4175
4176         /* load gr floorsweeping registers */
4177         data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4178         data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4179                         gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4180         gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4181
4182         gr_gk20a_zcull_init_hw(g, gr);
4183
4184         g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
4185         g->ops.clock_gating.pg_gr_load_gating_prod(g, true);
4186
4187         if (g->elcg_enabled) {
4188                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
4189                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
4190         } else {
4191                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
4192                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
4193         }
4194
4195         /* Bug 1340570: increase the clock timeout to avoid potential
4196          * operation failure at high gpcclk rate. Default values are 0x400.
4197          */
4198         gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
4199         gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
4200         gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
4201
4202         /* enable fifo access */
4203         gk20a_writel(g, gr_gpfifo_ctl_r(),
4204                      gr_gpfifo_ctl_access_enabled_f() |
4205                      gr_gpfifo_ctl_semaphore_access_enabled_f());
4206
4207         /* TBD: reload gr ucode when needed */
4208
4209         /* enable interrupts */
4210         gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4211         gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4212
4213         /* enable fecs error interrupts */
4214         gk20a_writel(g, gr_fecs_host_int_enable_r(),
4215                      gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4216                      gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4217                      gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4218                      gr_fecs_host_int_enable_watchdog_enable_f());
4219
4220         g->ops.gr.enable_hww_exceptions(g);
4221         g->ops.gr.set_hww_esr_report_mask(g);
4222
4223         /* enable per GPC exceptions */
4224         gk20a_gr_enable_gpc_exceptions(g);
4225
4226         /* TBD: ECC for L1/SM */
4227         /* TBD: enable per BE exceptions */
4228
4229         /* reset and enable all exceptions */
4230         gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4231         gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4232         gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4233         gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4234         gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4235         gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4236
4237         /* ignore status from some units */
4238         data = gk20a_readl(g, gr_status_mask_r());
4239         gk20a_writel(g, gr_status_mask_r(), data & gr->status_disable_mask);
4240
4241         if (gr->sw_ready)
4242                 gr_gk20a_load_zbc_table(g, gr);
4243         else
4244                 gr_gk20a_load_zbc_default_table(g, gr);
4245
4246         g->ops.ltc.init_cbc(g, gr);
4247
4248         /* load ctx init */
4249         for (i = 0; i < sw_ctx_load->count; i++)
4250                 gk20a_writel(g, sw_ctx_load->l[i].addr,
4251                              sw_ctx_load->l[i].value);
4252
4253         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4254         if (err)
4255                 goto out;
4256
4257         /* save and disable fe_go_idle */
4258         fe_go_idle_timeout_save =
4259                 gk20a_readl(g, gr_fe_go_idle_timeout_r());
4260         gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4261                 (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
4262                 gr_fe_go_idle_timeout_count_disabled_f());
4263
4264         /* override a few ctx state registers */
4265         g->ops.gr.commit_global_cb_manager(g, NULL, false);
4266         gr_gk20a_commit_global_timeslice(g, NULL, false);
4267
4268         /* floorsweep anything left */
4269         g->ops.gr.init_fs_state(g);
4270
4271         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4272         if (err)
4273                 goto restore_fe_go_idle;
4274
4275 restore_fe_go_idle:
4276         /* restore fe_go_idle */
4277         gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
4278
4279         if (err || gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT))
4280                 goto out;
4281
4282         /* load method init */
4283         if (sw_method_init->count) {
4284                 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4285                              sw_method_init->l[0].value);
4286                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4287                              gr_pri_mme_shadow_raw_index_write_trigger_f() |
4288                              sw_method_init->l[0].addr);
4289                 last_method_data = sw_method_init->l[0].value;
4290         }
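             /*
              * The shadow raw data register appears to hold its last value
              * across index writes, so the loop below only rewrites it when
              * the method value changes; every method still gets its own
              * index write to trigger the shadow RAM update.
              */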
4291         for (i = 1; i < sw_method_init->count; i++) {
4292                 if (sw_method_init->l[i].value != last_method_data) {
4293                         gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4294                                 sw_method_init->l[i].value);
4295                         last_method_data = sw_method_init->l[i].value;
4296                 }
4297                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4298                         gr_pri_mme_shadow_raw_index_write_trigger_f() |
4299                         sw_method_init->l[i].addr);
4300         }
4301
4302         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4303         if (err)
4304                 goto out;
4305
4306 out:
4307         gk20a_dbg_fn("done");
4308         return err;
4309 }
4310
4311 static int gk20a_init_gr_prepare(struct gk20a *g)
4312 {
4313         u32 gpfifo_ctrl, pmc_en;
4314         u32 err = 0;
4315
4316         /* disable fifo access */
4317         pmc_en = gk20a_readl(g, mc_enable_r());
4318         if (pmc_en & mc_enable_pgraph_enabled_f()) {
4319                 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4320                 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4321                 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4322         }
4323
4324         /* reset gr engine */
4325         gk20a_reset(g, mc_enable_pgraph_enabled_f()
4326                         | mc_enable_blg_enabled_f()
4327                         | mc_enable_perfmon_enabled_f());
4328
4329         /* enable fifo access */
4330         gk20a_writel(g, gr_gpfifo_ctl_r(),
4331                 gr_gpfifo_ctl_access_enabled_f() |
4332                 gr_gpfifo_ctl_semaphore_access_enabled_f());
4333
4334         if (!g->gr.ctx_vars.valid) {
4335                 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4336                 if (err)
4337                         gk20a_err(dev_from_gk20a(g),
4338                                 "fail to load gr init ctx");
4339         }
4340         return err;
4341 }
4342
4343 static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
4344 {
4345         int retries = GR_IDLE_CHECK_MAX / GR_IDLE_CHECK_DEFAULT;
4346         bool fecs_scrubbing;
4347         bool gpccs_scrubbing;
4348
4349         gk20a_dbg_fn("");
4350
4351         do {
4352                 fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
4353                         (gr_fecs_dmactl_imem_scrubbing_m() |
4354                          gr_fecs_dmactl_dmem_scrubbing_m());
4355
4356                 gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
4357                         (gr_gpccs_dmactl_imem_scrubbing_m() |
4358                          gr_gpccs_dmactl_dmem_scrubbing_m());
4359
4360                 if (!fecs_scrubbing && !gpccs_scrubbing) {
4361                         gk20a_dbg_fn("done");
4362                         return 0;
4363                 }
4364
4365                 udelay(GR_IDLE_CHECK_DEFAULT);
4366         } while (--retries || !tegra_platform_is_silicon());
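             /*
              * The retry budget above is GR_IDLE_CHECK_MAX / GR_IDLE_CHECK_DEFAULT
              * polls of GR_IDLE_CHECK_DEFAULT microseconds each, i.e. roughly
              * GR_IDLE_CHECK_MAX microseconds in total; on pre-silicon platforms
              * the loop keeps polling and never reaches the timeout below.
              */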
4367
4368         gk20a_err(dev_from_gk20a(g), "Falcon mem scrubbing timeout");
4369         return -ETIMEDOUT;
4370 }
4371
4372 static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4373 {
4374         struct gr_gk20a *gr = &g->gr;
4375         struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4376         unsigned long end_jiffies = jiffies +
4377                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4378         u32 i, err = 0;
4379
4380         gk20a_dbg_fn("");
4381
4382         /* enable interrupts */
4383         gk20a_writel(g, gr_intr_r(), ~0);
4384         gk20a_writel(g, gr_intr_en_r(), ~0);
4385
4386         /* reset ctx switch state */
4387         gr_gk20a_ctx_reset(g, 0);
4388
4389         /* clear scc ram */
4390         gk20a_writel(g, gr_scc_init_r(),
4391                 gr_scc_init_ram_trigger_f());
4392
4393         /* load non_ctx init */
4394         for (i = 0; i < sw_non_ctx_load->count; i++)
4395                 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4396                         sw_non_ctx_load->l[i].value);
4397
4398         err = gr_gk20a_wait_mem_scrubbing(g);
4399         if (err)
4400                 goto out;
4401
4402         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4403         if (err)
4404                 goto out;
4405
4406         err = gr_gk20a_load_ctxsw_ucode(g, gr);
4407         if (err)
4408                 goto out;
4409
4410         /* this appears to query sw state, but fecs actually initializes
4411            the ramchain etc., so this is hw init */
4412         err = gr_gk20a_init_ctx_state(g, gr);
4413         if (err)
4414                 goto out;
4415
4416 out:
4417         if (err)
4418                 gk20a_err(dev_from_gk20a(g), "fail");
4419         else
4420                 gk20a_dbg_fn("done");
4421
4422         return err;
4423 }
4424
4425 /*
4426  * XXX Merge this list with the debugger/profiler
4427  * session regops whitelists?
4428  */
4429 static u32 wl_addr_gk20a[] = {
4430         /* this list must be sorted (low to high) */
4431         0x404468, /* gr_pri_mme_max_instructions       */
4432         0x408944, /* gr_pri_bes_crop_hww_esr           */
4433         0x418800, /* gr_pri_gpcs_setup_debug           */
4434         0x419a04, /* gr_pri_gpcs_tpcs_tex_lod_dbg      */
4435         0x419a08, /* gr_pri_gpcs_tpcs_tex_samp_dbg     */
4436         0x419e10, /* gr_pri_gpcs_tpcs_sm_dbgr_control0 */
4437         0x419f78, /* gr_pri_gpcs_tpcs_sm_disp_ctrl     */
4438 };
4439
4440 static int gr_gk20a_init_access_map(struct gk20a *g)
4441 {
4442         struct gr_gk20a *gr = &g->gr;
4443         void *data;
4444         int err = 0;
4445         u32 w, nr_pages =
4446                 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4447                              PAGE_SIZE);
4448
4449         data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].pages,
4450                     PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].size) >>
4451                     PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
4452         if (!data) {
4453                 gk20a_err(dev_from_gk20a(g),
4454                           "failed to map priv access map memory");
4455                 err = -ENOMEM;
4456                 goto clean_up;
4457         }
4458
4459         memset(data, 0x0, PAGE_SIZE * nr_pages);
4460
4461         for (w = 0; w < ARRAY_SIZE(wl_addr_gk20a); w++) {
4462                 u32 map_bit, map_byte, map_shift;
4463                 map_bit = wl_addr_gk20a[w] >> 2;
4464                 map_byte = map_bit >> 3;
4465                 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4466                 gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d",
4467                   wl_addr_gk20a[w], map_byte, map_shift);
4468                 ((u8 *)data)[map_byte] |= 1 << map_shift;
4469         }
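             /*
              * Each whitelisted register becomes one bit in the map: map_bit
              * is the register word index (addr >> 2), map_byte/map_shift
              * locate that bit in the byte array.  For 0x404468 above:
              * 0x404468 >> 2 = 0x10111a, so byte 0x20223, bit 2 is set.
              */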
4470
4471 clean_up:
4472         if (data)
4473                 vunmap(data);
4474         return err;
4475 }
4476
4477 static int gk20a_init_gr_setup_sw(struct gk20a *g)
4478 {
4479         struct gr_gk20a *gr = &g->gr;
4480         int err;
4481
4482         gk20a_dbg_fn("");
4483
4484         if (gr->sw_ready) {
4485                 gk20a_dbg_fn("skip init");
4486                 return 0;
4487         }
4488
4489         gr->g = g;
4490
4491         err = gr_gk20a_init_gr_config(g, gr);
4492         if (err)
4493                 goto clean_up;
4494
4495         err = gr_gk20a_init_mmu_sw(g, gr);
4496         if (err)
4497                 goto clean_up;
4498
4499         err = gr_gk20a_init_map_tiles(g, gr);
4500         if (err)
4501                 goto clean_up;
4502
4503         if (tegra_cpu_is_asim()) {
4504                 gr->max_comptag_mem = 1; /* MBs worth of comptag coverage */
4505         } else {
4506                 gk20a_dbg_info("total ram pages : %lu", totalram_pages);
4507                 gr->max_comptag_mem = totalram_pages
4508                                          >> (10 - (PAGE_SHIFT - 10));
4509         }
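             /*
              * 10 - (PAGE_SHIFT - 10) is 20 - PAGE_SHIFT, so the shift turns a
              * page count into whole MBs of RAM: with 4 KiB pages this is
              * totalram_pages >> 8, and 256 pages * 4 KiB = 1 MiB.
              */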
4510         err = g->ops.ltc.init_comptags(g, gr);
4511         if (err)
4512                 goto clean_up;
4513
4514         err = gr_gk20a_init_zcull(g, gr);
4515         if (err)
4516                 goto clean_up;
4517
4518         err = gr_gk20a_alloc_global_ctx_buffers(g);
4519         if (err)
4520                 goto clean_up;
4521
4522         err = gr_gk20a_init_access_map(g);
4523         if (err)
4524                 goto clean_up;
4525
4526         mutex_init(&gr->ctx_mutex);
4527         spin_lock_init(&gr->ch_tlb_lock);
4528
4529         gr->remove_support = gk20a_remove_gr_support;
4530         gr->sw_ready = true;
4531
4532         gk20a_dbg_fn("done");
4533         return 0;
4534
4535 clean_up:
4536         gk20a_err(dev_from_gk20a(g), "fail");
4537         gk20a_remove_gr_support(gr);
4538         return err;
4539 }
4540
4541 int gk20a_init_gr_support(struct gk20a *g)
4542 {
4543         u32 err;
4544
4545         gk20a_dbg_fn("");
4546
4547         err = gk20a_init_gr_prepare(g);
4548         if (err)
4549                 return err;
4550
4551         /* this is required before gr_gk20a_init_ctx_state */
4552         mutex_init(&g->gr.fecs_mutex);
4553
4554         err = gk20a_init_gr_reset_enable_hw(g);
4555         if (err)
4556                 return err;
4557
4558         err = gk20a_init_gr_setup_sw(g);
4559         if (err)
4560                 return err;
4561
4562         err = gk20a_init_gr_setup_hw(g);
4563         if (err)
4564                 return err;
4565
4566         /* GR is initialized, signal possible waiters */
4567         g->gr.initialized = true;
4568         wake_up(&g->gr.init_wq);
4569
4570         return 0;
4571 }
4572
4573 /* Wait until GR is initialized */
4574 void gk20a_gr_wait_initialized(struct gk20a *g)
4575 {
4576         wait_event(g->gr.init_wq, g->gr.initialized);
4577 }
4578
4579 #define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE   0x02dc
4580 #define NVA297_SET_CIRCULAR_BUFFER_SIZE         0x1280
4581 #define NVA297_SET_SHADER_EXCEPTIONS            0x1528
4582 #define NVA0C0_SET_SHADER_EXCEPTIONS            0x1528
4583
4584 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4585
4586 struct gr_isr_data {
4587         u32 addr;
4588         u32 data_lo;
4589         u32 data_hi;
4590         u32 curr_ctx;
4591         u32 chid;
4592         u32 offset;
4593         u32 sub_chan;
4594         u32 class_num;
4595 };
4596
4597 void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
4598 {
4599         gk20a_dbg_fn("");
4600
4601         if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
4602                 gk20a_writel(g,
4603                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
4604                 gk20a_writel(g,
4605                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
4606         } else {
4607                 /* setup sm warp esr report masks */
4608                 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4609                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4610                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4611                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4612                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4613                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4614                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4615                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4616                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4617                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4618                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4619                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4620                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4621                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4622                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4623                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4624                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4625                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4626                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4627                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4628                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4629
4630                 /* setup sm global esr report mask */
4631                 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4632                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4633                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4634                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4635                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4636                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4637                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4638                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4639         }
4640 }
4641
4642 static void gk20a_gr_set_circular_buffer_size(struct gk20a *g, u32 data)
4643 {
4644         struct gr_gk20a *gr = &g->gr;
4645         u32 gpc_index, ppc_index, stride, val, offset;
4646         u32 cb_size = data * 4;
4647
4648         gk20a_dbg_fn("");
4649
4650         if (cb_size > gr->attrib_cb_size)
4651                 cb_size = gr->attrib_cb_size;
4652
4653         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4654                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4655                  ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
4656                  gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
4657
4658         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4659                 stride = proj_gpc_stride_v() * gpc_index;
4660
4661                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4662                         ppc_index++) {
4663
4664                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
4665                                 stride +
4666                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4667
4668                         offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
4669
4670                         val = set_field(val,
4671                                 gr_gpc0_ppc0_cbm_cfg_size_m(),
4672                                 gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
4673                                         gr->pes_tpc_count[ppc_index][gpc_index]));
4674                         val = set_field(val,
4675                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4676                                 (offset + 1));
4677
4678                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4679                                 stride +
4680                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4681
4682                         val = set_field(val,
4683                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4684                                 offset);
4685
4686                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4687                                 stride +
4688                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4689                 }
4690         }
4691 }
4692
4693 static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g, u32 data)
4694 {
4695         struct gr_gk20a *gr = &g->gr;
4696         u32 gpc_index, ppc_index, stride, val;
4697         u32 pd_ab_max_output;
4698         u32 alpha_cb_size = data * 4;
4699
4700         gk20a_dbg_fn("");
4701         /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF)
4702                 return; */
4703
4704         if (alpha_cb_size > gr->alpha_cb_size)
4705                 alpha_cb_size = gr->alpha_cb_size;
4706
4707         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4708                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4709                  ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
4710                  gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
4711
4712         pd_ab_max_output = alpha_cb_size *
4713                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
4714                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
4715
4716         gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
4717                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output));
4718
4719         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4720                 stride = proj_gpc_stride_v() * gpc_index;
4721
4722                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4723                         ppc_index++) {
4724
4725                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4726                                 stride +
4727                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4728
4729                         val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
4730                                         gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
4731                                                 gr->pes_tpc_count[ppc_index][gpc_index]));
4732
4733                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4734                                 stride +
4735                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4736                 }
4737         }
4738 }
4739
4740 int gk20a_gr_reset(struct gk20a *g)
4741 {
4742         int err;
4743         u32 size;
4744
4745         err = gk20a_init_gr_prepare(g);
4746         if (err)
4747                 return err;
4748
4749         err = gk20a_init_gr_reset_enable_hw(g);
4750         if (err)
4751                 return err;
4752
4753         err = gk20a_init_gr_setup_hw(g);
4754         if (err)
4755                 return err;
4756
4757         size = 0;
4758         err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
4759         if (err) {
4760                 gk20a_err(dev_from_gk20a(g),
4761                         "fail to query fecs pg buffer size");
4762                 return err;
4763         }
4764
4765         err = gr_gk20a_fecs_set_reglist_bind_inst(g,
4766                         g->mm.pmu.inst_block.cpu_pa);
4767         if (err) {
4768                 gk20a_err(dev_from_gk20a(g),
4769                         "fail to bind pmu inst to gr");
4770                 return err;
4771         }
4772
4773         err = gr_gk20a_fecs_set_reglist_virtual_addr(g, g->pmu.pg_buf.pmu_va);
4774         if (err) {
4775                 gk20a_err(dev_from_gk20a(g),
4776                         "fail to set pg buffer pmu va");
4777                 return err;
4778         }
4779
4780         return 0;
4781 }
4782
4783 static int gr_gk20a_handle_sw_method(struct gk20a *g, u32 addr,
4784                                           u32 class_num, u32 offset, u32 data)
4785 {
4786         gk20a_dbg_fn("");
4787
4788         if (class_num == KEPLER_COMPUTE_A) {
4789                 switch (offset << 2) {
4790                 case NVA0C0_SET_SHADER_EXCEPTIONS:
4791                         gk20a_gr_set_shader_exceptions(g, data);
4792                         break;
4793                 default:
4794                         goto fail;
4795                 }
4796         }
4797
4798         if (class_num == KEPLER_C) {
4799                 switch (offset << 2) {
4800                 case NVA297_SET_SHADER_EXCEPTIONS:
4801                         gk20a_gr_set_shader_exceptions(g, data);
4802                         break;
4803                 case NVA297_SET_CIRCULAR_BUFFER_SIZE:
4804                         g->ops.gr.set_circular_buffer_size(g, data);
4805                         break;
4806                 case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
4807                         g->ops.gr.set_alpha_circular_buffer_size(g, data);
4808                         break;
4809                 default:
4810                         goto fail;
4811                 }
4812         }
4813         return 0;
4814
4815 fail:
4816         return -EINVAL;
4817 }
4818
4819 static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
4820                   struct gr_isr_data *isr_data)
4821 {
4822         struct fifo_gk20a *f = &g->fifo;
4823         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4824         gk20a_dbg_fn("");
4825         gk20a_set_error_notifier(ch,
4826                                 NVHOST_CHANNEL_GR_SEMAPHORE_TIMEOUT);
4827         gk20a_err(dev_from_gk20a(g),
4828                    "gr semaphore timeout\n");
4829         return -EINVAL;
4830 }
4831
4832 static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
4833                   struct gr_isr_data *isr_data)
4834 {
4835         struct fifo_gk20a *f = &g->fifo;
4836         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4837         gk20a_dbg_fn("");
4838         gk20a_set_error_notifier(ch,
4839                                 NVHOST_CHANNEL_GR_ILLEGAL_NOTIFY);
4840         /* This is an unrecoverable error, reset is needed */
4841         gk20a_err(dev_from_gk20a(g),
4842                    "gr illegal notify pending\n");
4843         return -EINVAL;
4844 }
4845
4846 static int gk20a_gr_handle_illegal_method(struct gk20a *g,
4847                                           struct gr_isr_data *isr_data)
4848 {
4849         int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
4850                         isr_data->class_num, isr_data->offset,
4851                         isr_data->data_lo);
4852         if (ret)
4853                 gk20a_err(dev_from_gk20a(g), "invalid method class 0x%08x"
4854                         ", offset 0x%08x address 0x%08x\n",
4855                         isr_data->class_num, isr_data->offset, isr_data->addr);
4856
4857         return ret;
4858 }
4859
4860 static int gk20a_gr_handle_illegal_class(struct gk20a *g,
4861                                           struct gr_isr_data *isr_data)
4862 {
4863         struct fifo_gk20a *f = &g->fifo;
4864         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4865         gk20a_dbg_fn("");
4866         gk20a_set_error_notifier(ch,
4867                                 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4868         gk20a_err(dev_from_gk20a(g),
4869                    "invalid class 0x%08x, offset 0x%08x",
4870                    isr_data->class_num, isr_data->offset);
4871         return -EINVAL;
4872 }
4873
4874 static int gk20a_gr_handle_fecs_error(struct gk20a *g,
4875                                           struct gr_isr_data *isr_data)
4876 {
4877         struct fifo_gk20a *f = &g->fifo;
4878         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4879         u32 gr_fecs_intr = gk20a_readl(g, gr_fecs_intr_r());
4880         gk20a_dbg_fn("");
4881
4882         gk20a_err(dev_from_gk20a(g),
4883                    "unhandled fecs error interrupt 0x%08x for channel %u",
4884                    gr_fecs_intr, ch->hw_chid);
4885
4886         gk20a_writel(g, gr_fecs_intr_r(), gr_fecs_intr);
4887         return -EINVAL;
4888 }
4889
4890 static int gk20a_gr_handle_class_error(struct gk20a *g,
4891                                           struct gr_isr_data *isr_data)
4892 {
4893         struct fifo_gk20a *f = &g->fifo;
4894         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4895         u32 gr_class_error =
4896                 gr_class_error_code_v(gk20a_readl(g, gr_class_error_r()));
4897         gk20a_dbg_fn("");
4898
4899         gk20a_set_error_notifier(ch,
4900                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4901         gk20a_err(dev_from_gk20a(g),
4902                    "class error 0x%08x, offset 0x%08x, unhandled intr 0x%08x for channel %u\n",
4903                    isr_data->class_num, isr_data->offset,
4904                    gr_class_error, ch->hw_chid);
4905         return -EINVAL;
4906 }
4907
4908 static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
4909                                              struct gr_isr_data *isr_data)
4910 {
4911         struct fifo_gk20a *f = &g->fifo;
4912         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4913
4914         wake_up(&ch->semaphore_wq);
4915
4916         return 0;
4917 }
4918
4919 #if defined(CONFIG_GK20A_CYCLE_STATS)
4920 static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
4921                                                          u32 offset)
4922 {
4923         /* support only 24-bit 4-byte aligned offsets */
4924         bool valid = !(offset & 0xFF000003);
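             /*
              * 0xFF000003 rejects offsets that do not fit in 24 bits (top byte
              * set) or are not 4-byte aligned (low two bits set): e.g. 0x1000000
              * and 0x2 both fail, while 0x00404468 passes.
              */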
4925         /* whitelist check */
4926         valid = valid &&
4927                 is_bar0_global_offset_whitelisted_gk20a(offset);
4928         /* resource size check in case there was a problem
4929          * with allocating the assumed size of bar0 */
4930         valid = valid &&
4931                 offset < resource_size(g->reg_mem);
4932         return valid;
4933 }
4934 #endif
4935
4936 static int gk20a_gr_handle_notify_pending(struct gk20a *g,
4937                                           struct gr_isr_data *isr_data)
4938 {
4939         struct fifo_gk20a *f = &g->fifo;
4940         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4941
4942 #if defined(CONFIG_GK20A_CYCLE_STATS)
4943         void *virtual_address;
4944         u32 buffer_size;
4945         u32 offset;
4946         u32 new_offset;
4947         bool exit;
4948         struct share_buffer_head *sh_hdr;
4949         u32 raw_reg;
4950         u64 mask_orig;
4951         u64 v = 0;
4952         struct gk20a_cyclestate_buffer_elem *op_elem;
4953         /* GL will never use payload 0 for cycle state */
4954         if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
4955                 return 0;
4956
4957         mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
4958
4959         virtual_address = ch->cyclestate.cyclestate_buffer;
4960         buffer_size = ch->cyclestate.cyclestate_buffer_size;
4961         offset = isr_data->data_lo;
4962         exit = false;
4963         while (!exit) {
4964                 if (offset >= buffer_size) {
4965                         WARN_ON(1);
4966                         break;
4967                 }
4968
4969                 sh_hdr = (struct share_buffer_head *)
4970                         ((char *)virtual_address + offset);
4971
4972                 if (sh_hdr->size < sizeof(struct share_buffer_head)) {
4973                         WARN_ON(1);
4974                         break;
4975                 }
4976                 new_offset = offset + sh_hdr->size;
4977
4978                 switch (sh_hdr->operation) {
4979                 case OP_END:
4980                         exit = true;
4981                         break;
4982
4983                 case BAR0_READ32:
4984                 case BAR0_WRITE32:
4985                 {
4986                         bool valid;
4987                         op_elem =
4988                                 (struct gk20a_cyclestate_buffer_elem *)
4989                                         sh_hdr;
4990                         valid = is_valid_cyclestats_bar0_offset_gk20a(g,
4991                                                         op_elem->offset_bar0);
4992                         if (!valid) {
4993                                 gk20a_err(dev_from_gk20a(g),
4994                                            "invalid cyclestats op offset: 0x%x\n",
4995                                            op_elem->offset_bar0);
4996
4997                                 sh_hdr->failed = exit = true;
4998                                 break;
4999                         }
5000
5001
5002                         mask_orig =
5003                                 ((1ULL << (op_elem->last_bit + 1)) - 1) &
5004                                 ~((1ULL << op_elem->first_bit) - 1);
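                             /*
                              * mask_orig covers bits first_bit..last_bit
                              * inclusive; e.g. first_bit = 4, last_bit = 7
                              * gives ((1 << 8) - 1) & ~((1 << 4) - 1) = 0xf0.
                              * The BAR0_WRITE32 case below read-modify-writes
                              * only within that field.
                              */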
5007
5008                         raw_reg =
5009                                 gk20a_readl(g,
5010                                             op_elem->offset_bar0);
5011
5012                         switch (sh_hdr->operation) {
5013                         case BAR0_READ32:
5014                                 op_elem->data =
5015                                         (raw_reg & mask_orig)
5016                                         >> op_elem->first_bit;
5017                                 break;
5018
5019                         case BAR0_WRITE32:
5020                                 v = 0;
5021                                 if ((unsigned int)mask_orig !=
5022                                     (unsigned int)~0) {
5023                                         v = (unsigned int)
5024                                                 (raw_reg & ~mask_orig);
5025                                 }
5026
5027                                 v |= ((op_elem->data
5028                                        << op_elem->first_bit)
5029                                       & mask_orig);
5030
5031                                 gk20a_writel(g,
5032                                              op_elem->offset_bar0,
5033                                              (unsigned int)v);
5034                                 break;
5035                         default:
5036                                 /* nop ok?*/
5037                                 break;
5038                         }
5039                 }
5040                 break;
5041
5042                 default:
5043                         /* no operation content case */
5044                         exit = true;
5045                         break;
5046                 }
5047                 sh_hdr->completed = true;
5048                 offset = new_offset;
5049         }
5050         mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
5051 #endif
5052         gk20a_dbg_fn("");
5053         wake_up(&ch->notifier_wq);
5054         return 0;
5055 }
5056
5057 /* Used by sw interrupt thread to translate current ctx to chid.
5058  * For performance, we don't want to go through 128 channels every time.
5059  * curr_ctx should be the value read from gr_fecs_current_ctx_r().
5060  * A small tlb is used here to cache translation */
5061 static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx)
5062 {
5063         struct fifo_gk20a *f = &g->fifo;
5064         struct gr_gk20a *gr = &g->gr;
5065         u32 chid = -1;
5066         u32 i;
5067
5068         /* when contexts are unloaded from GR, the valid bit is reset
5069          * but the instance pointer information remains intact. So the
5070          * valid bit must be checked to be absolutely certain that a
5071          * valid context is currently resident. */
5072         if (!gr_fecs_current_ctx_valid_v(curr_ctx))
5073                 return -1;
5074
5075         spin_lock(&gr->ch_tlb_lock);
5076
5077         /* check cache first */
5078         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5079                 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5080                         chid = gr->chid_tlb[i].hw_chid;
5081                         goto unlock;
5082                 }
5083         }
5084
5085         /* slow path */
5086         for (chid = 0; chid < f->num_channels; chid++) {
5087                 if (f->channel[chid].in_use &&
5088                     (u32)(f->channel[chid].inst_block.cpu_pa >>
5089                           ram_in_base_shift_v()) ==
5090                           gr_fecs_current_ctx_ptr_v(curr_ctx))
5091                         break;
5092         }
5093
5094         if (chid >= f->num_channels) {
5095                 chid = -1;
5096                 goto unlock;
5097         }
5098
5099         /* add to free tlb entry */
5100         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5101                 if (gr->chid_tlb[i].curr_ctx == 0) {
5102                         gr->chid_tlb[i].curr_ctx = curr_ctx;
5103                         gr->chid_tlb[i].hw_chid = chid;
5104                         goto unlock;
5105                 }
5106         }
5107
5108         /* no free entry, flush one */
5109         gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5110         gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
5111
5112         gr->channel_tlb_flush_index =
5113                 (gr->channel_tlb_flush_index + 1) &
5114                 (GR_CHANNEL_MAP_TLB_SIZE - 1);
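             /*
              * Eviction is plain round-robin; the mask-based wrap assumes
              * GR_CHANNEL_MAP_TLB_SIZE is a power of two (e.g. with a size of
              * 4 the flush index would cycle 0, 1, 2, 3, 0, ...).
              */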
5115
5116 unlock:
5117         spin_unlock(&gr->ch_tlb_lock);
5118         return chid;
5119 }
5120
5121 static int gk20a_gr_lock_down_sm(struct gk20a *g, u32 global_esr_mask)
5122 {
5123         unsigned long end_jiffies = jiffies +
5124                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5125         u32 delay = GR_IDLE_CHECK_DEFAULT;
5126         bool mmu_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled(g);
5127         u32 dbgr_control0;
5128
5129         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locking down SM");
5130
5131         /* assert stop trigger */
5132         dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5133         dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5134         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5135
5136         /* wait for the sm to lock down */
5137         do {
5138                 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5139                 u32 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5140                 u32 dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r());
5141                 bool locked_down =
5142                         (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
5143                          gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
5144                 bool error_pending =
5145                         (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
5146                          gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
5147                         ((global_esr & ~global_esr_mask) != 0);
5148
5149                 if (locked_down || !error_pending) {
5150                         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locked down SM");
5151
5152                         /* de-assert stop trigger */
5153                         dbgr_control0 &= ~gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5154                         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5155
5156                         return 0;
5157                 }
5158
5159                 /* if an mmu fault is pending and mmu debug mode is not
5160                  * enabled, the sm will never lock down. */
5161                 if (!mmu_debug_mode_enabled && gk20a_fifo_mmu_fault_pending(g)) {
5162                         gk20a_err(dev_from_gk20a(g), "mmu fault pending, sm will"
5163                                    " never lock down!");
5164                         return -EFAULT;
5165                 }
5166
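                     /* poll with exponential backoff: the delay doubles each
                      * iteration up to GR_IDLE_CHECK_MAX, e.g. 100, 200, 400, ...
                      * microseconds if GR_IDLE_CHECK_DEFAULT were 100. */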
5167                 usleep_range(delay, delay * 2);
5168                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
5169
5170         } while (time_before(jiffies, end_jiffies)
5171                         || !tegra_platform_is_silicon());
5172
5173         gk20a_err(dev_from_gk20a(g), "timed out while trying to lock down SM");
5174
5175         return -EAGAIN;
5176 }
5177
5178 bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5179 {
5180         u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5181
5182         /* check if an sm debugger is attached */
5183         if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5184                         gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
5185                 return true;
5186
5187         return false;
5188 }
5189
5190 static void gk20a_gr_clear_sm_hww(struct gk20a *g, u32 global_esr)
5191 {
5192         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r(), global_esr);
5193
5194         /* clear the warp hww */
5195         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r(),
5196                         gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
5197 }
5198
5199 static struct channel_gk20a *
5200 channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
5201 {
5202         return g->fifo.channel+hw_chid;
5203 }
5204
5205 static int gk20a_gr_handle_sm_exception(struct gk20a *g,
5206                 struct gr_isr_data *isr_data)
5207 {
5208         int ret = 0;
5209         bool do_warp_sync = false;
5210         /* these three interrupts don't require locking down the SM. They can
5211          * be handled by usermode clients as they aren't fatal. Additionally,
5212          * usermode clients may wish to allow some warps to execute while others
5213          * are at breakpoints, as opposed to fatal errors where all warps should
5214          * halt. */
5215         u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()   |
5216                           gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
5217                           gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
5218         u32 global_esr, warp_esr;
5219         bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
5220         struct channel_gk20a *fault_ch;
5221
5222         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
5223
5224         global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5225         warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5226
5227         /* if an sm debugger is attached, disable forwarding of tpc exceptions.
5228          * the debugger will reenable exceptions after servicing them. */
5229         if (sm_debugger_attached) {
5230                 u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
5231                 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5232                 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), tpc_exception_en);
5233                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM debugger attached");
5234         }
5235
5236         /* if a debugger is present and an error has occurred, do a warp sync */
5237         if (sm_debugger_attached && ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5238                 gk20a_dbg(gpu_dbg_intr, "warp sync needed");
5239                 do_warp_sync = true;
5240         }
5241
5242         if (do_warp_sync) {
5243                 ret = gk20a_gr_lock_down_sm(g, global_mask);
5244                 if (ret) {
5245                         gk20a_err(dev_from_gk20a(g), "sm did not lock down!\n");
5246                         return ret;
5247                 }
5248         }
5249
5250         /* finally, signal any client waiting on an event */
5251         fault_ch = channel_from_hw_chid(g, isr_data->chid);
5252         if (fault_ch)
5253                 gk20a_dbg_gpu_post_events(fault_ch);
5254
5255         return ret;
5256 }
5257
5258 static int gk20a_gr_handle_tpc_exception(struct gk20a *g,
5259                 struct gr_isr_data *isr_data)
5260 {
5261         int ret = 0;
5262         u32 tpc_exception = gk20a_readl(g, gr_gpcs_tpcs_tpccs_tpc_exception_r());
5263
5264         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5265
5266         /* check if an sm exception is pending */
5267         if (gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(tpc_exception) ==
5268                         gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v()) {
5269                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM exception pending");
5270                 ret = gk20a_gr_handle_sm_exception(g, isr_data);
5271         }
5272
5273         return ret;
5274 }
5275
5276 static int gk20a_gr_handle_gpc_exception(struct gk20a *g,
5277                 struct gr_isr_data *isr_data)
5278 {
5279         int ret = 0;
5280         u32 gpc_exception = gk20a_readl(g, gr_gpcs_gpccs_gpc_exception_r());
5281
5282         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5283
5284         /* check if tpc 0 has an exception */
5285         if (gr_gpcs_gpccs_gpc_exception_tpc_v(gpc_exception) ==
5286                         gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v()) {
5287                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "TPC exception pending");
5288                 ret = gk20a_gr_handle_tpc_exception(g, isr_data);
5289         }
5290
5291         return ret;
5292 }
5293
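/*
 * The exception handlers above form a chain: gk20a_gr_isr() below notices a
 * GR exception, gk20a_gr_handle_gpc_exception() checks the GPC status for a
 * pending TPC 0 exception, gk20a_gr_handle_tpc_exception() checks the TPC
 * status for a pending SM exception, and gk20a_gr_handle_sm_exception()
 * decides whether to lock the SM down and notifies any waiting debugger
 * clients on the faulting channel.
 */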
5294 int gk20a_gr_isr(struct gk20a *g)
5295 {
5296         struct gr_isr_data isr_data;
5297         u32 grfifo_ctl;
5298         u32 obj_table;
5299         int need_reset = 0;
5300         u32 gr_intr = gk20a_readl(g, gr_intr_r());
5301
5302         gk20a_dbg_fn("");
5303         gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr);
5304
5305         if (!gr_intr)
5306                 return 0;
5307
5308         grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5309         grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5310         grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5311
5312         gk20a_writel(g, gr_gpfifo_ctl_r(),
5313                 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
5314                 gr_gpfifo_ctl_semaphore_access_f(0));
5315
5316         isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
5317         isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
5318         isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
5319         isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
5320         isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
5321         isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
5322         obj_table = gk20a_readl(g,
5323                 gr_fe_object_table_r(isr_data.sub_chan));
5324         isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
5325
5326         isr_data.chid =
5327                 gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx);
5328         if (isr_data.chid == -1) {
5329                 gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
5330                            isr_data.curr_ctx);
5331                 goto clean_up;
5332         }
5333
5334         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5335                 "channel %d: addr 0x%08x, "
5336                 "data 0x%08x 0x%08x,"
5337                 "ctx 0x%08x, offset 0x%08x, "
5338                 "subchannel 0x%08x, class 0x%08x",
5339                 isr_data.chid, isr_data.addr,
5340                 isr_data.data_hi, isr_data.data_lo,
5341                 isr_data.curr_ctx, isr_data.offset,
5342                 isr_data.sub_chan, isr_data.class_num);
5343
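        /* Each pending condition below is handled the same way: run the
         * handler (which may flag need_reset), acknowledge the interrupt by
         * writing its reset field to gr_intr_r(), and clear the bit in the
         * local gr_intr copy so anything still set at the end of the ISR is
         * reported as an unhandled gr interrupt. */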
5344         if (gr_intr & gr_intr_notify_pending_f()) {
5345                 gk20a_gr_handle_notify_pending(g, &isr_data);
5346                 gk20a_writel(g, gr_intr_r(),
5347                         gr_intr_notify_reset_f());
5348                 gr_intr &= ~gr_intr_notify_pending_f();
5349         }
5350
5351         if (gr_intr & gr_intr_semaphore_pending_f()) {
5352                 gk20a_gr_handle_semaphore_pending(g, &isr_data);
5353                 gk20a_writel(g, gr_intr_r(),
5354                         gr_intr_semaphore_reset_f());
5355                 gr_intr &= ~gr_intr_semaphore_pending_f();
5356         }
5357
5358         if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
5359                 need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
5360                         &isr_data);
5361                 gk20a_writel(g, gr_intr_r(),
5362                         gr_intr_semaphore_reset_f());
5363                 gr_intr &= ~gr_intr_semaphore_pending_f();
5364         }
5365
5366         if (gr_intr & gr_intr_illegal_notify_pending_f()) {
5367                 need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
5368                         &isr_data);
5369                 gk20a_writel(g, gr_intr_r(),
5370                         gr_intr_illegal_notify_reset_f());
5371                 gr_intr &= ~gr_intr_illegal_notify_pending_f();
5372         }
5373
5374         if (gr_intr & gr_intr_illegal_method_pending_f()) {
5375                 need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
5376                 gk20a_writel(g, gr_intr_r(),
5377                         gr_intr_illegal_method_reset_f());
5378                 gr_intr &= ~gr_intr_illegal_method_pending_f();
5379         }
5380
5381         if (gr_intr & gr_intr_illegal_class_pending_f()) {
5382                 need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
5383                 gk20a_writel(g, gr_intr_r(),
5384                         gr_intr_illegal_class_reset_f());
5385                 gr_intr &= ~gr_intr_illegal_class_pending_f();
5386         }
5387
5388         if (gr_intr & gr_intr_fecs_error_pending_f()) {
5389                 need_reset |= gk20a_gr_handle_fecs_error(g, &isr_data);
5390                 gk20a_writel(g, gr_intr_r(),
5391                         gr_intr_fecs_error_reset_f());
5392                 gr_intr &= ~gr_intr_fecs_error_pending_f();
5393         }
5394
5395         if (gr_intr & gr_intr_class_error_pending_f()) {
5396                 need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
5397                 gk20a_writel(g, gr_intr_r(),
5398                         gr_intr_class_error_reset_f());
5399                 gr_intr &= ~gr_intr_class_error_pending_f();
5400         }
5401
5402         /* this one happens if someone tries to hit a non-whitelisted
5403          * register using set_falcon[4] */
5404         if (gr_intr & gr_intr_firmware_method_pending_f()) {
5405                 need_reset |= true;
5406                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
5407                 gk20a_writel(g, gr_intr_r(),
5408                         gr_intr_firmware_method_reset_f());
5409                 gr_intr &= ~gr_intr_firmware_method_pending_f();
5410         }
5411
5412         if (gr_intr & gr_intr_exception_pending_f()) {
5413                 u32 exception = gk20a_readl(g, gr_exception_r());
5414                 struct fifo_gk20a *f = &g->fifo;
5415                 struct channel_gk20a *ch = &f->channel[isr_data.chid];
5416
5417                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
5418
5419                 if (exception & gr_exception_fe_m()) {
5420                         u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5421                         gk20a_dbg(gpu_dbg_intr, "fe warning %08x\n", fe);
5422                         gk20a_writel(g, gr_fe_hww_esr_r(), fe);
5423                         need_reset |= -EFAULT;
5424                 }
5425
5426                 /* check if a gpc exception has occurred */
5427                 if (exception & gr_exception_gpc_m() && need_reset == 0) {
5428                         u32 exception1 = gk20a_readl(g, gr_exception1_r());
5429                         u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5430
5431                         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC exception pending");
5432
5433                         /* if no sm debugger is present, clean up the channel */
5434                         if (!gk20a_gr_sm_debugger_attached(g)) {
5435                                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5436                                            "SM debugger not attached, clearing interrupt");
5437                                 need_reset |= -EFAULT;
5438                         } else {
5439                                 /* check if gpc 0 has an exception */
5440                                 if (exception1 & gr_exception1_gpc_0_pending_f())
5441                                         need_reset |= gk20a_gr_handle_gpc_exception(g, &isr_data);
5442                                 /* clear the hwws; this also causes the
5443                                  * tpc and gpc exceptions to be cleared */
5444                                 gk20a_gr_clear_sm_hww(g, global_esr);
5445                         }
5446
5447                         if (need_reset)
5448                                 gk20a_set_error_notifier(ch,
5449                                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
5450                 }
5451
5452                 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
5453                 gr_intr &= ~gr_intr_exception_pending_f();
5454         }
5455
5456         if (need_reset)
5457                 gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true);
5458
5459 clean_up:
5460         gk20a_writel(g, gr_gpfifo_ctl_r(),
5461                 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
5462                 gr_gpfifo_ctl_semaphore_access_f(1));
5463
5464         if (gr_intr)
5465                 gk20a_err(dev_from_gk20a(g),
5466                            "unhandled gr interrupt 0x%08x", gr_intr);
5467
5468         return 0;
5469 }
5470
5471 int gk20a_gr_nonstall_isr(struct gk20a *g)
5472 {
5473         u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
5474         u32 clear_intr = 0;
5475
5476         gk20a_dbg(gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
5477
5478         if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
5479                 gk20a_channel_semaphore_wakeup(g);
5480                 clear_intr |= gr_intr_nonstall_trap_pending_f();
5481         }
5482
5483         gk20a_writel(g, gr_intr_nonstall_r(), clear_intr);
5484
5485         return 0;
5486 }
5487
5488 int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
5489 {
5490         BUG_ON(size == NULL);
5491         return gr_gk20a_submit_fecs_method_op(g,
5492                    (struct fecs_method_op_gk20a) {
5493                            .mailbox.id = 0,
5494                            .mailbox.data = 0,
5495                            .mailbox.clr = ~0,
5496                            .method.data = 1,
5497                            .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
5498                            .mailbox.ret = size,
5499                            .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
5500                            .mailbox.ok = 0,
5501                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5502                            .mailbox.fail = 0});
5503 }
5504
5505 int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
5506 {
5507         return gr_gk20a_submit_fecs_method_op(g,
5508                    (struct fecs_method_op_gk20a){
5509                            .mailbox.id = 4,
5510                            .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
5511                                             gr_fecs_current_ctx_valid_f(1) |
5512                                             gr_fecs_current_ctx_target_vid_mem_f()),
5513                            .mailbox.clr = ~0,
5514                            .method.data = 1,
5515                            .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
5516                            .mailbox.ret = NULL,
5517                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5518                            .mailbox.ok = 1,
5519                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5520                            .mailbox.fail = 0});
5521 }
5522
5523 int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va)
5524 {
5525         return gr_gk20a_submit_fecs_method_op(g,
5526                    (struct fecs_method_op_gk20a) {
5527                            .mailbox.id = 4,
5528                            .mailbox.data = u64_lo32(pmu_va >> 8),
5529                            .mailbox.clr = ~0,
5530                            .method.data = 1,
5531                            .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
5532                            .mailbox.ret = NULL,
5533                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5534                            .mailbox.ok = 1,
5535                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5536                            .mailbox.fail = 0});
5537 }
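/*
 * The three FECS reglist helpers above are typically used together when
 * setting up the FECS register list: query the image size, bind the instance
 * block, then program the GPU virtual address of the buffer. A minimal caller
 * sketch (allocation and error handling omitted; inst_block_pa and
 * reglist_gpu_va are hypothetical names for addresses obtained elsewhere):
 *
 *      u32 size;
 *      if (!gr_gk20a_fecs_get_reglist_img_size(g, &size)) {
 *              // allocate a buffer of at least 'size' bytes
 *              gr_gk20a_fecs_set_reglist_bind_inst(g, inst_block_pa);
 *              gr_gk20a_fecs_set_reglist_virtual_addr(g, reglist_gpu_va);
 *      }
 */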
5538
5539 int gk20a_gr_suspend(struct gk20a *g)
5540 {
5541         unsigned long end_jiffies = jiffies +
5542                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5543         u32 ret = 0;
5544
5545         gk20a_dbg_fn("");
5546
5547         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
5548         if (ret)
5549                 return ret;
5550
5551         gk20a_writel(g, gr_gpfifo_ctl_r(),
5552                 gr_gpfifo_ctl_access_disabled_f());
5553
5554         /* disable gr intr */
5555         gk20a_writel(g, gr_intr_r(), 0);
5556         gk20a_writel(g, gr_intr_en_r(), 0);
5557
5558         /* disable all exceptions */
5559         gk20a_writel(g, gr_exception_r(), 0);
5560         gk20a_writel(g, gr_exception_en_r(), 0);
5561         gk20a_writel(g, gr_exception1_r(), 0);
5562         gk20a_writel(g, gr_exception1_en_r(), 0);
5563         gk20a_writel(g, gr_exception2_r(), 0);
5564         gk20a_writel(g, gr_exception2_en_r(), 0);
5565
5566         gk20a_gr_flush_channel_tlb(&g->gr);
5567
5568         g->gr.initialized = false;
5569
5570         gk20a_dbg_fn("done");
5571         return ret;
5572 }
5573
5574 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5575                                                u32 addr,
5576                                                bool is_quad, u32 quad,
5577                                                u32 *context_buffer,
5578                                                u32 context_buffer_size,
5579                                                u32 *priv_offset);
5580
5581 /* This function will decode a priv address and return the partition type and numbers. */
5582 int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
5583                               int  *addr_type, /* enum ctxsw_addr_type */
5584                               u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
5585                               u32 *broadcast_flags)
5586 {
5587         u32 gpc_addr;
5588         u32 ppc_address;
5589         u32 ppc_broadcast_addr;
5590
5591         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5592
5593         /* setup defaults */
5594         ppc_address = 0;
5595         ppc_broadcast_addr = 0;
5596         *addr_type = CTXSW_ADDR_TYPE_SYS;
5597         *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
5598         *gpc_num = 0;
5599         *tpc_num = 0;
5600         *ppc_num = 0;
5601         *be_num  = 0;
5602
5603         if (pri_is_gpc_addr(addr)) {
5604                 *addr_type = CTXSW_ADDR_TYPE_GPC;
5605                 gpc_addr = pri_gpccs_addr_mask(addr);
5606                 if (pri_is_gpc_addr_shared(addr)) {
5607                         *addr_type = CTXSW_ADDR_TYPE_GPC;
5608                         *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
5609                 } else
5610                         *gpc_num = pri_get_gpc_num(addr);
5611
5612                 if (pri_is_tpc_addr(gpc_addr)) {
5613                         *addr_type = CTXSW_ADDR_TYPE_TPC;
5614                         if (pri_is_tpc_addr_shared(gpc_addr)) {
5615                                 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
5616                                 return 0;
5617                         }
5618                         *tpc_num = pri_get_tpc_num(gpc_addr);
5619                 }
5620                 return 0;
5621         } else if (pri_is_be_addr(addr)) {
5622                 *addr_type = CTXSW_ADDR_TYPE_BE;
5623                 if (pri_is_be_addr_shared(addr)) {
5624                         *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
5625                         return 0;
5626                 }
5627                 *be_num = pri_get_be_num(addr);
5628                 return 0;
5629         } else {
5630                 *addr_type = CTXSW_ADDR_TYPE_SYS;
5631                 return 0;
5632         }
5633         /* note: PPC addresses are not decoded here; PRI_BROADCAST_FLAGS_PPC is never set */
5634
5635         /*NOTREACHED*/
5636         return -EINVAL;
5637 }
5638
5639 static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
5640                                       u32 gpc_num,
5641                                       u32 *priv_addr_table, u32 *t)
5642 {
5643         u32 ppc_num;
5644
5645         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5646
5647         for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
5648                 priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
5649                                                        gpc_num, ppc_num);
5650
5651         return 0;
5652 }
5653
5654 /*
5655  * The context buffer is indexed using BE broadcast addresses and GPC/TPC
5656  * unicast addresses. This function will convert a BE unicast address to a BE
5657  * broadcast address and split a GPC/TPC broadcast address into a table of
5658  * GPC/TPC addresses.  The addresses generated by this function can be
5659  * successfully processed by gr_gk20a_find_priv_offset_in_buffer
5660  */
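/*
 * Worked example (counts hypothetical): with gpc_count = 1 and
 * gpc_tpc_count[0] = 2, a broadcast GPC/TPC address expands into two table
 * entries,
 *      priv_addr_table[0] = pri_tpc_addr(pri_tpccs_addr_mask(addr), 0, 0);
 *      priv_addr_table[1] = pri_tpc_addr(pri_tpccs_addr_mask(addr), 0, 1);
 * while a BE unicast address collapses to the single broadcast entry
 * produced by pri_be_shared_addr(addr).
 */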
5661 static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
5662                                            u32 addr,
5663                                            u32 *priv_addr_table,
5664                                            u32 *num_registers)
5665 {
5666         int addr_type; /*enum ctxsw_addr_type */
5667         u32 gpc_num, tpc_num, ppc_num, be_num;
5668         u32 broadcast_flags;
5669         u32 t;
5670         int err;
5671
5672         t = 0;
5673         *num_registers = 0;
5674
5675         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5676
5677         err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5678                                         &gpc_num, &tpc_num, &ppc_num, &be_num,
5679                                         &broadcast_flags);
5680         gk20a_dbg(gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
5681         if (err)
5682                 return err;
5683
5684         if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5685             (addr_type == CTXSW_ADDR_TYPE_BE)) {
5686                 /* The BE broadcast registers are included in the compressed PRI
5687                  * table. Convert a BE unicast address to a broadcast address
5688                  * so that we can look up the offset. */
5689                 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
5690                     !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
5691                         priv_addr_table[t++] = pri_be_shared_addr(addr);
5692                 else
5693                         priv_addr_table[t++] = addr;
5694
5695                 *num_registers = t;
5696                 return 0;
5697         }
5698
5699         /* The GPC/TPC unicast registers are included in the compressed PRI
5700          * tables. Convert a GPC/TPC broadcast address to unicast addresses so
5701          * that we can look up the offsets. */
5702         if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
5703                 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
5704
5705                         if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5706                                 for (tpc_num = 0;
5707                                      tpc_num < g->gr.gpc_tpc_count[gpc_num];
5708                                      tpc_num++)
5709                                         priv_addr_table[t++] =
5710                                                 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5711                                                              gpc_num, tpc_num);
5712
5713                         else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
5714                                 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5715                                                                priv_addr_table, &t);
5716                                 if (err)
5717                                         return err;
5718                         } else
5719                                 priv_addr_table[t++] =
5720                                         pri_gpc_addr(pri_gpccs_addr_mask(addr),
5721                                                      gpc_num);
5722                 }
5723         } else {
5724                 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5725                         for (tpc_num = 0;
5726                              tpc_num < g->gr.gpc_tpc_count[gpc_num];
5727                              tpc_num++)
5728                                 priv_addr_table[t++] =
5729                                         pri_tpc_addr(pri_tpccs_addr_mask(addr),
5730                                                      gpc_num, tpc_num);
5731                 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
5732                         err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5733                                                        priv_addr_table, &t);
5734                 else
5735                         priv_addr_table[t++] = addr;
5736         }
5737
5738         *num_registers = t;
5739         return 0;
5740 }
5741
5742 int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
5743                                     u32 addr,
5744                                     u32 max_offsets,
5745                                     u32 *offsets, u32 *offset_addrs,
5746                                     u32 *num_offsets,
5747                                     bool is_quad, u32 quad)
5748 {
5749         u32 i;
5750         u32 priv_offset = 0;
5751         u32 *priv_registers;
5752         u32 num_registers = 0;
5753         int err = 0;
5754         u32 potential_offsets = proj_scal_litter_num_gpcs_v() *
5755                 proj_scal_litter_num_tpc_per_gpc_v();
5756
5757         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5758
5759         /* implementation is crossed-up if either of these happen */
5760         if (max_offsets > potential_offsets)
5761                 return -EINVAL;
5762
5763         if (!g->gr.ctx_vars.golden_image_initialized)
5764                 return -ENODEV;
5765
5766         priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL);
5767         if (!priv_registers) {
5768                 gk20a_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets);
5769                 err = -ENOMEM;
5770                 goto cleanup;
5771         }
5772         memset(offsets,      0, sizeof(u32) * max_offsets);
5773         memset(offset_addrs, 0, sizeof(u32) * max_offsets);
5774         *num_offsets = 0;
5775
5776         gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], &num_registers);
5777
5778         if ((max_offsets > 1) && (num_registers > max_offsets)) {
5779                 err = -EINVAL;
5780                 goto cleanup;
5781         }
5782
5783         if ((max_offsets == 1) && (num_registers > 1))
5784                 num_registers = 1;
5785
5786         if (!g->gr.ctx_vars.local_golden_image) {
5787                 gk20a_dbg_fn("no context switch header info to work with");
5788                 err = -EINVAL;
5789                 goto cleanup;
5790         }
5791
5792         for (i = 0; i < num_registers; i++) {
5793                 err = gr_gk20a_find_priv_offset_in_buffer(g,
5794                                                   priv_registers[i],
5795                                                   is_quad, quad,
5796                                                   g->gr.ctx_vars.local_golden_image,
5797                                                   g->gr.ctx_vars.golden_image_size,
5798                                                   &priv_offset);
5799                 if (err) {
5800                         gk20a_dbg_fn("Could not determine priv_offset for addr:0x%x",
5801                                       addr); /*, grPriRegStr(addr)));*/
5802                         goto cleanup;
5803                 }
5804
5805                 offsets[i] = priv_offset;
5806                 offset_addrs[i] = priv_registers[i];
5807         }
5808
5809         *num_offsets = num_registers;
5810
5811  cleanup:
5812
5813         if (priv_registers)
5814                 kfree(priv_registers);
5815
5816         return err;
5817 }
5818
5819 /* Set up some register tables.  This looks hacky; our
5820  * register/offset functions are just that, functions.
5821  * So they can't be used as initializers... TBD: fix to
5822  * generate consts at least on an as-needed basis.
5823  */
5824 static const u32 _num_ovr_perf_regs = 17;
5825 static u32 _ovr_perf_regs[17] = { 0, };
5826 /* Following are the blocks of registers that the ucode
5827  * stores in the extended region. */
5828 /* ==  ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */
5829 static const u32 _num_sm_dsm_perf_regs = 5;
5830 /* ==  ctxsw_extended_sm_dsm_perf_counter_control_register_stride_v() ?*/
5831 static const u32 _num_sm_dsm_perf_ctrl_regs = 4;
5832 static u32 _sm_dsm_perf_regs[5];
5833 static u32 _sm_dsm_perf_ctrl_regs[4];
5834
5835 static void init_sm_dsm_reg_info(void)
5836 {
5837         if (_ovr_perf_regs[0] != 0)
5838                 return;
5839
5840         _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
5841         _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
5842         _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
5843         _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
5844         _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
5845         _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
5846         _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
5847         _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
5848         _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
5849         _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
5850         _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
5851         _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
5852         _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
5853         _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
5854         _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
5855         _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
5856         _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
5857
5858
5859         _sm_dsm_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r();
5860         _sm_dsm_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r();
5861         _sm_dsm_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r();
5862         _sm_dsm_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r();
5863         _sm_dsm_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r();
5864
5865         _sm_dsm_perf_ctrl_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r();
5866         _sm_dsm_perf_ctrl_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r();
5867         _sm_dsm_perf_ctrl_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r();
5868         _sm_dsm_perf_ctrl_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r();
5869
5870 }
5871
5872 /* TBD: would like to handle this elsewhere, at a higher level.
5873  * these are currently constructed in a "test-then-write" style
5874  * which makes it impossible to know externally whether a ctx
5875  * write will actually occur. so later we should put a lazy,
5876  *  map-and-hold system in the patch write state */
5877 int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
5878                             struct channel_ctx_gk20a *ch_ctx,
5879                             u32 addr, u32 data,
5880                             u8 *context)
5881 {
5882         u32 num_gpc = g->gr.gpc_count;
5883         u32 num_tpc;
5884         u32 tpc, gpc, reg;
5885         u32 chk_addr;
5886         u32 vaddr_lo;
5887         u32 vaddr_hi;
5888         u32 tmp;
5889
5890         init_sm_dsm_reg_info();
5891
5892         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5893
5894         for (reg = 0; reg < _num_ovr_perf_regs; reg++) {
5895                 for (gpc = 0; gpc < num_gpc; gpc++)  {
5896                         num_tpc = g->gr.gpc_tpc_count[gpc];
5897                         for (tpc = 0; tpc < num_tpc; tpc++) {
5898                                 chk_addr = ((proj_gpc_stride_v() * gpc) +
5899                                             (proj_tpc_in_gpc_stride_v() * tpc) +
5900                                             _ovr_perf_regs[reg]);
5901                                 if (chk_addr != addr)
5902                                         continue;
5903                                 /* reset the patch count from previous
5904                                    runs, if ucode has already processed
5905                                    it */
5906                                 tmp = gk20a_mem_rd32(context +
5907                                        ctxsw_prog_main_image_patch_count_o(), 0);
5908
5909                                 if (!tmp)
5910                                         ch_ctx->patch_ctx.data_count = 0;
5911
5912                                 gr_gk20a_ctx_patch_write(g, ch_ctx,
5913                                                          addr, data, true);
5914
5915                                 vaddr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
5916                                 vaddr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
5917
5918                                 gk20a_mem_wr32(context +
5919                                          ctxsw_prog_main_image_patch_count_o(),
5920                                          0, ch_ctx->patch_ctx.data_count);
5921                                 gk20a_mem_wr32(context +
5922                                          ctxsw_prog_main_image_patch_adr_lo_o(),
5923                                          0, vaddr_lo);
5924                                 gk20a_mem_wr32(context +
5925                                          ctxsw_prog_main_image_patch_adr_hi_o(),
5926                                          0, vaddr_hi);
5927
5928                                 /* we're not caching these on cpu side,
5929                                    but later watch for it */
5930                                 return 0;
5931                         }
5932                 }
5933         }
5934
5935         return 0;
5936 }
5937
5938 static void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset)
5939 {
5940         u32 reg;
5941         u32 quad_ctrl;
5942         u32 half_ctrl;
5943         u32 tpc, gpc;
5944         u32 gpc_tpc_addr;
5945         u32 gpc_tpc_stride;
5946
5947         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "offset=0x%x", offset);
5948
5949         gpc = pri_get_gpc_num(offset);
5950         gpc_tpc_addr = pri_gpccs_addr_mask(offset);
5951         tpc = pri_get_tpc_num(gpc_tpc_addr);
5952
5953         quad_ctrl = quad & 0x1; /* first bit tells us quad */
5954         half_ctrl = (quad >> 1) & 0x1; /* second bit tells us half */
5955
5956         gpc_tpc_stride = gpc * proj_gpc_stride_v() +
5957                 tpc * proj_tpc_in_gpc_stride_v();
5958         gpc_tpc_addr = gr_gpc0_tpc0_sm_halfctl_ctrl_r() + gpc_tpc_stride;
5959
5960         reg = gk20a_readl(g, gpc_tpc_addr);
5961         reg = set_field(reg,
5962                 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(),
5963                 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_f(quad_ctrl));
5964
5965         gk20a_writel(g, gpc_tpc_addr, reg);
5966
5967         gpc_tpc_addr = gr_gpc0_tpc0_sm_debug_sfe_control_r() + gpc_tpc_stride;
5968         reg = gk20a_readl(g, gpc_tpc_addr);
5969         reg = set_field(reg,
5970                 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(),
5971                 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_f(half_ctrl));
5972         gk20a_writel(g, gpc_tpc_addr, reg);
5973 }
5974
5975 #define ILLEGAL_ID (~0)
5976
5977 static inline bool check_main_image_header_magic(void *context)
5978 {
5979         u32 magic = gk20a_mem_rd32(context +
5980                              ctxsw_prog_main_image_magic_value_o(), 0);
5981         gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic);
5982         return magic == ctxsw_prog_main_image_magic_value_v_value_v();
5983 }
5984 static inline bool check_local_header_magic(void *context)
5985 {
5986         u32 magic = gk20a_mem_rd32(context +
5987                              ctxsw_prog_local_magic_value_o(), 0);
5988         gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x",  magic);
5989         return magic == ctxsw_prog_local_magic_value_v_value_v();
5990
5991 }
5992
5993 /* most likely dupe of ctxsw_gpccs_header__size_1_v() */
5994 static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
5995 {
5996         return 256;
5997 }
5998
5999 void gr_gk20a_get_sm_dsm_perf_regs(struct gk20a *g,
6000                                         u32 *num_sm_dsm_perf_regs,
6001                                         u32 **sm_dsm_perf_regs,
6002                                         u32 *perf_register_stride)
6003 {
6004         *num_sm_dsm_perf_regs = _num_sm_dsm_perf_regs;
6005         *sm_dsm_perf_regs = _sm_dsm_perf_regs;
6006         *perf_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v();
6007 }
6008
6009 void gr_gk20a_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
6010                                         u32 *num_sm_dsm_perf_ctrl_regs,
6011                                         u32 **sm_dsm_perf_ctrl_regs,
6012                                         u32 *ctrl_register_stride)
6013 {
6014         *num_sm_dsm_perf_ctrl_regs = _num_sm_dsm_perf_ctrl_regs;
6015         *sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs;
6016         *ctrl_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v();
6017 }
6018
6019 static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
6020                                                    u32 addr,
6021                                                    bool is_quad, u32 quad,
6022                                                    u32 *context_buffer,
6023                                                    u32 context_buffer_size,
6024                                                    u32 *priv_offset)
6025 {
6026         u32 i, data32;
6027         u32 gpc_num, tpc_num;
6028         u32 num_gpcs, num_tpcs;
6029         u32 chk_addr;
6030         u32 ext_priv_offset, ext_priv_size;
6031         void *context;
6032         u32 offset_to_segment, offset_to_segment_end;
6033         u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
6034         u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
6035         u32 num_ext_gpccs_ext_buffer_segments;
6036         u32 inter_seg_offset;
6037         u32 tpc_gpc_mask = (proj_tpc_in_gpc_stride_v() - 1);
6038         u32 max_tpc_count;
6039         u32 *sm_dsm_perf_ctrl_regs = NULL;
6040         u32 num_sm_dsm_perf_ctrl_regs = 0;
6041         u32 *sm_dsm_perf_regs = NULL;
6042         u32 num_sm_dsm_perf_regs = 0;
6043         u32 buffer_segments_size = 0;
6044         u32 marker_size = 0;
6045         u32 control_register_stride = 0;
6046         u32 perf_register_stride = 0;
6047
6048         /* Only have TPC registers in extended region, so if not a TPC reg,
6049            then return error so caller can look elsewhere. */
6050         if (pri_is_gpc_addr(addr))   {
6051                 u32 gpc_addr = 0;
6052                 gpc_num = pri_get_gpc_num(addr);
6053                 gpc_addr = pri_gpccs_addr_mask(addr);
6054                 if (pri_is_tpc_addr(gpc_addr))
6055                         tpc_num = pri_get_tpc_num(gpc_addr);
6056                 else
6057                         return -EINVAL;
6058
6059                 gk20a_dbg_info(" gpc = %d tpc = %d",
6060                                 gpc_num, tpc_num);
6061         } else
6062                 return -EINVAL;
6063
6064         buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
6065         /* note below is in words/num_registers */
6066         marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
6067
6068         context = context_buffer;
6069         /* sanity check main header */
6070         if (!check_main_image_header_magic(context)) {
6071                 gk20a_err(dev_from_gk20a(g),
6072                            "Invalid main header: magic value");
6073                 return -EINVAL;
6074         }
6075         num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
6076         if (gpc_num >= num_gpcs) {
6077                 gk20a_err(dev_from_gk20a(g),
6078                    "GPC 0x%08x is greater than total count 0x%08x!\n",
6079                            gpc_num, num_gpcs);
6080                 return -EINVAL;
6081         }
6082
6083         data32 = gk20a_mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0);
6084         ext_priv_size   = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
6085         if (0 == ext_priv_size) {
6086                 gk20a_dbg_info(" No extended memory in context buffer");
6087                 return -EINVAL;
6088         }
6089         ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
6090
6091         offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
6092         offset_to_segment_end = offset_to_segment +
6093                 (ext_priv_size * buffer_segments_size);
6094
6095         /* check local header magic */
6096         context += ctxsw_prog_ucode_header_size_in_bytes();
6097         if (!check_local_header_magic(context)) {
6098                 gk20a_err(dev_from_gk20a(g),
6099                            "Invalid local header: magic value\n");
6100                 return -EINVAL;
6101         }
6102
6103         /*
6104          * See if the incoming register address is in the first table of
6105          * registers. We check this by decoding only the TPC addr portion.
6106          * If we get a hit on the TPC bit, we then double check the address
6107          * by computing it from the base gpc/tpc strides.  Then make sure
6108          * it is a real match.
6109          */
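        /*
         * For example, for gpc_num = 1 and tpc_num = 0 (values hypothetical),
         * the address is rebuilt as
         *      chk_addr = proj_gpc_base_v()
         *               + proj_gpc_stride_v() * 1
         *               + proj_tpc_in_gpc_base_v()
         *               + proj_tpc_in_gpc_stride_v() * 0
         *               + (sm_dsm_perf_regs[i] & tpc_gpc_mask);
         * and only an exact match with the incoming addr is accepted.
         */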
6110         g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs,
6111                                        &sm_dsm_perf_regs,
6112                                        &perf_register_stride);
6113
6114         init_sm_dsm_reg_info();
6115
6116         for (i = 0; i < num_sm_dsm_perf_regs; i++) {
6117                 if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
6118                         sm_dsm_perf_reg_id = i;
6119
6120                         gk20a_dbg_info("register match: 0x%08x",
6121                                         sm_dsm_perf_regs[i]);
6122
6123                         chk_addr = (proj_gpc_base_v() +
6124                                    (proj_gpc_stride_v() * gpc_num) +
6125                                    proj_tpc_in_gpc_base_v() +
6126                                    (proj_tpc_in_gpc_stride_v() * tpc_num) +
6127                                    (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask));
6128
6129                         if (chk_addr != addr) {
6130                                 gk20a_err(dev_from_gk20a(g),
6131                                    "Oops addr mismatch! : 0x%08x != 0x%08x\n",
6132                                            addr, chk_addr);
6133                                 return -EINVAL;
6134                         }
6135                         break;
6136                 }
6137         }
6138
6139         /* Didn't find reg in supported group 1.
6140          *  so try the second group now */
6141         g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs,
6142                                        &sm_dsm_perf_ctrl_regs,
6143                                        &control_register_stride);
6144
6145         if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
6146                 for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
6147                         if ((addr & tpc_gpc_mask) ==
6148                             (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
6149                                 sm_dsm_perf_ctrl_reg_id = i;
6150
6151                                 gk20a_dbg_info("register match: 0x%08x",
6152                                                 sm_dsm_perf_ctrl_regs[i]);
6153
6154                                 chk_addr = (proj_gpc_base_v() +
6155                                            (proj_gpc_stride_v() * gpc_num) +
6156                                            proj_tpc_in_gpc_base_v() +
6157                                            (proj_tpc_in_gpc_stride_v() * tpc_num) +
6158                                            (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
6159                                             tpc_gpc_mask));
6160
6161                                 if (chk_addr != addr) {
6162                                         gk20a_err(dev_from_gk20a(g),
6163                                                    "Oops addr mismatch! : 0x%08x != 0x%08x\n",
6164                                                    addr, chk_addr);
6165                                         return -EINVAL;
6166
6167                                 }
6168
6169                                 break;
6170                         }
6171                 }
6172         }
6173
6174         if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
6175             (ILLEGAL_ID == sm_dsm_perf_reg_id))
6176                 return -EINVAL;
6177
6178         /* Skip the FECS extended header, nothing there for us now. */
6179         offset_to_segment += buffer_segments_size;
6180
6181         /* skip through the GPCCS extended headers until we get to the data for
6182          * our GPC.  The size of each gpc extended segment is enough to hold the
6183          * max tpc count for the gpcs, in 256b chunks.
6184          */
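        /*
         * For example, with a hypothetical max_tpc_count of 4, each GPC owns
         * (4 + 1) / 2 = 2 extended-buffer segments, so the data for gpc_num
         * starts 2 * buffer_segments_size * gpc_num bytes past the FECS
         * extended header that was skipped above.
         */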
6185
6186         max_tpc_count = proj_scal_litter_num_tpc_per_gpc_v();
6187
6188         num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);
6189
6190         offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
6191                               buffer_segments_size * gpc_num);
6192
6193         num_tpcs = g->gr.gpc_tpc_count[gpc_num];
6194
6195         /* skip the head marker to start with */
6196         inter_seg_offset = marker_size;
6197
6198         if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
6199                 /* skip over control regs of TPC's before the one we want.
6200                  *  then skip to the register in this tpc */
6201                 inter_seg_offset = inter_seg_offset +
6202                         (tpc_num * control_register_stride) +
6203                         sm_dsm_perf_ctrl_reg_id;
6204         } else {
6205                 /* skip all the control registers */
6206                 inter_seg_offset = inter_seg_offset +
6207                         (num_tpcs * control_register_stride);
6208
6209                 /* skip the marker between control and counter segments */
6210                 inter_seg_offset += marker_size;
6211
6212                 /* skip over counter regs of TPCs before the one we want */
6213                 inter_seg_offset = inter_seg_offset +
6214                         (tpc_num * perf_register_stride) *
6215                         ctxsw_prog_extended_num_smpc_quadrants_v();
6216
6217                 /* skip over the register for the quadrants we do not want.
6218                  *  then skip to the register in this tpc */
6219                 inter_seg_offset = inter_seg_offset +
6220                         (perf_register_stride * quad) +
6221                         sm_dsm_perf_reg_id;
6222         }
6223
6224         /* set the offset to the segment offset plus the inter segment offset to
6225          *  our register */
6226         offset_to_segment += (inter_seg_offset * 4);
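        /*
         * inter_seg_offset is accumulated in 32-bit words: a head marker,
         * then, for the control path, the control registers of earlier TPCs
         * followed by the requested control register; or, for the counter
         * path, all TPCs' control registers, a second marker, the counter
         * registers of earlier TPCs across every SMPC quadrant, and finally
         * the requested quadrant and register. Multiplying by 4 converts the
         * word offset into the byte offset used within the extended segment.
         */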
6227
6228         /* last sanity check: did we somehow compute an offset outside the
6229          * extended buffer? */
6230         if (offset_to_segment > offset_to_segment_end) {
6231                 gk20a_err(dev_from_gk20a(g),
6232                            "Overflow ctxsw buffer! 0x%08x > 0x%08x\n",
6233                            offset_to_segment, offset_to_segment_end);
6234                 return -EINVAL;
6235         }
6236
6237         *priv_offset = offset_to_segment;
6238
6239         return 0;
6240 }
6241
6242
6243 static int
6244 gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
6245                                              int addr_type,/* enum ctxsw_addr_type */
6246                                              u32 pri_addr,
6247                                              u32 gpc_num, u32 num_tpcs,
6248                                              u32 num_ppcs, u32 ppc_mask,
6249                                              u32 *priv_offset)
6250 {
6251         u32 i;
6252         u32 address, base_address;
6253         u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
6254         u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
6255         struct aiv_gk20a *reg;
6256
6257         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);
6258
6259         if (!g->gr.ctx_vars.valid)
6260                 return -EINVAL;
6261
6262         /* Process the SYS/BE segment. */
6263         if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6264             (addr_type == CTXSW_ADDR_TYPE_BE)) {
6265                 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
6266                         reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
6267                         address    = reg->addr;
6268                         sys_offset = reg->index;
6269
6270                         if (pri_addr == address) {
6271                                 *priv_offset = sys_offset;
6272                                 return 0;
6273                         }
6274                 }
6275         }
6276
6277         /* Process the TPC segment. */
6278         if (addr_type == CTXSW_ADDR_TYPE_TPC) {
6279                 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
6280                         for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
6281                                 reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
6282                                 address = reg->addr;
6283                                 tpc_addr = pri_tpccs_addr_mask(address);
6284                                 base_address = proj_gpc_base_v() +
6285                                         (gpc_num * proj_gpc_stride_v()) +
6286                                         proj_tpc_in_gpc_base_v() +
6287                                         (tpc_num * proj_tpc_in_gpc_stride_v());
6288                                 address = base_address + tpc_addr;
6289                                 /*
6290                                  * The data for the TPCs is interleaved in the context buffer.
6291                                  * Example with num_tpcs = 2
6292                                  * 0    1    2    3    4    5    6    7    8    9    10   11 ...
6293                                  * 0-0  1-0  0-1  1-1  0-2  1-2  0-3  1-3  0-4  1-4  0-5  1-5 ...
6294                                  */
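                                /*
                                 * Worked example (values hypothetical): with
                                 * num_tpcs = 2, tpc_num = 1 and reg->index = 8
                                 * (the third 32-bit word), the line below
                                 * gives tpc_offset = 8 * 2 + 1 * 4 = 20 bytes,
                                 * i.e. word 5, the "1-2" slot in the
                                 * interleave diagram above.
                                 */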
6295                                 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
6296
6297                                 if (pri_addr == address) {
6298                                         *priv_offset = tpc_offset;
6299                                         return 0;
6300                                 }
6301                         }
6302                 }
6303         }
6304
6305         /* Process the PPC segment. */
6306         if (addr_type == CTXSW_ADDR_TYPE_PPC) {
6307                 for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
6308                         for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
6309                                 reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
6310                                 address = reg->addr;
6311                                 ppc_addr = pri_ppccs_addr_mask(address);
6312                                 base_address = proj_gpc_base_v() +
6313                                         (gpc_num * proj_gpc_stride_v()) +
6314                                         proj_ppc_in_gpc_base_v() +
6315                                         (ppc_num * proj_ppc_in_gpc_stride_v());
6316                                 address = base_address + ppc_addr;
6317                                 /*
6318                                  * The data for the PPCs is interleaved in the context buffer.
6319                                  * Example with num_ppcs = 2
6320                                  * 0    1    2    3    4    5    6    7    8    9    10   11 ...
6321                                  * 0-0  1-0  0-1  1-1  0-2  1-2  0-3  1-3  0-4  1-4  0-5  1-5 ...
6322                                  */
6323                                 ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);
6324
6325                                 if (pri_addr == address)  {
6326                                         *priv_offset = ppc_offset;
6327                                         return 0;
6328                                 }
6329                         }
6330                 }
6331         }
6332
6333
6334         /* Process the GPC segment. */
6335         if (addr_type == CTXSW_ADDR_TYPE_GPC) {
6336                 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
6337                         reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];
6338
6339                         address = reg->addr;
6340                         gpc_addr = pri_gpccs_addr_mask(address);
6341                         gpc_offset = reg->index;
6342
6343                         base_address = proj_gpc_base_v() +
6344                                 (gpc_num * proj_gpc_stride_v());
6345                         address = base_address + gpc_addr;
6346
6347                         if (pri_addr == address) {
6348                                 *priv_offset = gpc_offset;
6349                                 return 0;
6350                         }
6351                 }
6352         }
6353
6354         return -EINVAL;
6355 }
6356
6357 static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
6358                                                void *context,
6359                                                u32 *num_ppcs, u32 *ppc_mask,
6360                                                u32 *reg_ppc_count)
6361 {
6362         u32 data32;
6363         u32 litter_num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
6364
6365         /*
6366          * if there is only 1 PES_PER_GPC, then we put the PES registers
6367          * in the GPC reglist, so we can't error out if ppc.count == 0
6368          */
6369         if ((!g->gr.ctx_vars.valid) ||
6370             ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
6371              (litter_num_pes_per_gpc > 1)))
6372                 return -EINVAL;
6373
6374         data32 = gk20a_mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0);
6375
6376         *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
6377         *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
6378
6379         *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;
6380
6381         return 0;
6382 }
6383
6384
6385
6386 /*
6387  *  This function will return the 32 bit offset for a priv register if it is
6388  *  present in the context buffer.
6389  */
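/*
 * Lookup order below: decode the priv address, try the extended (perfmon)
 * section via gr_gk20a_find_priv_offset_in_ext_buffer() first, serve SYS/BE
 * addresses from the FECS segment, then walk the per-GPC GPCCS local headers
 * to locate the GPC/TPC/PPC segment for the requested GPC.
 */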
6390 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
6391                                                u32 addr,
6392                                                bool is_quad, u32 quad,
6393                                                u32 *context_buffer,
6394                                                u32 context_buffer_size,
6395                                                u32 *priv_offset)
6396 {
6397         struct gr_gk20a *gr = &g->gr;
6398         u32 i, data32;
6399         int err;
6400         int addr_type; /*enum ctxsw_addr_type */
6401         u32 broadcast_flags;
6402         u32 gpc_num, tpc_num, ppc_num, be_num;
6403         u32 num_gpcs, num_tpcs, num_ppcs;
6404         u32 offset;
6405         u32 sys_priv_offset, gpc_priv_offset;
6406         u32 ppc_mask, reg_list_ppc_count;
6407         void *context;
6408         u32 offset_to_segment;
6409
6410         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6411
6412         err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
6413                                         &gpc_num, &tpc_num, &ppc_num, &be_num,
6414                                         &broadcast_flags);
6415         if (err)
6416                 return err;
6417
6418         context = context_buffer;
6419         if (!check_main_image_header_magic(context)) {
6420                 gk20a_err(dev_from_gk20a(g),
6421                            "Invalid main header: magic value");
6422                 return -EINVAL;
6423         }
6424         num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
6425
6426         /* Parse the FECS local header. */
6427         context += ctxsw_prog_ucode_header_size_in_bytes();
6428         if (!check_local_header_magic(context)) {
6429                 gk20a_err(dev_from_gk20a(g),
6430                            "Invalid FECS local header: magic value\n");
6431                 return -EINVAL;
6432         }
6433         data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
6434         sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
6435
6436         /* Try the extended buffer first.  If the lookup fails but the
6437          * address was expected to be there (a quad offset), propagate
6438          * the error; otherwise fall back to searching the main image.
6439          */
6440         err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
6441                                       addr, is_quad, quad, context_buffer,
6442                                       context_buffer_size, priv_offset);
6443         if (!err || (err && is_quad))
6444                 return err;
6445
6446         if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6447             (addr_type == CTXSW_ADDR_TYPE_BE)) {
6448                 /* Find the offset in the FECS segment. */
6449                 offset_to_segment = sys_priv_offset *
6450                         ctxsw_prog_ucode_header_size_in_bytes();
6451
6452                 err = gr_gk20a_process_context_buffer_priv_segment(g,
6453                                            addr_type, addr,
6454                                            0, 0, 0, 0,
6455                                            &offset);
6456                 if (err)
6457                         return err;
6458
6459                 *priv_offset = (offset_to_segment + offset);
6460                 return 0;
6461         }
6462
6463         if ((gpc_num + 1) > num_gpcs)  {
6464                 gk20a_err(dev_from_gk20a(g),
6465                            "GPC %d not in this context buffer.\n",
6466                            gpc_num);
6467                 return -EINVAL;
6468         }
6469
6470         /* Parse the GPCCS local header(s).*/
6471         for (i = 0; i < num_gpcs; i++) {
6472                 context += ctxsw_prog_ucode_header_size_in_bytes();
6473                 if (!check_local_header_magic(context)) {
6474                         gk20a_err(dev_from_gk20a(g),
6475                                    "Invalid GPCCS local header: magic value\n");
6476                         return -EINVAL;
6477
6478                 }
6479                 data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
6480                 gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
6481
6482                 err = gr_gk20a_determine_ppc_configuration(g, context,
6483                                                            &num_ppcs, &ppc_mask,
6484                                                            &reg_list_ppc_count);
6485                 if (err)
6486                         return err;
6487
6488                 num_tpcs = gk20a_mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0);
6489
6490                 if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
6491                         gk20a_err(dev_from_gk20a(g),
6492                            "GPC %d TPC %d not in this context buffer.\n",
6493                                    gpc_num, tpc_num);
6494                         return -EINVAL;
6495                 }
6496
6497                 /* Find the offset in the GPCCS segment.*/
6498                 if (i == gpc_num) {
6499                         offset_to_segment = gpc_priv_offset *
6500                                 ctxsw_prog_ucode_header_size_in_bytes();
6501
6502                         if (addr_type == CTXSW_ADDR_TYPE_TPC) {
6503                                 /*reg = gr->ctx_vars.ctxsw_regs.tpc.l;*/
6504                         } else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
6505                                 /* The ucode stores TPC data before PPC data.
6506                                  * Advance offset past TPC data to PPC data. */
6507                                 offset_to_segment +=
6508                                         ((gr->ctx_vars.ctxsw_regs.tpc.count *
6509                                           num_tpcs) << 2);
6510                         } else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
6511                                 /* The ucode stores TPC/PPC data before GPC data.
6512                                  * Advance offset past TPC/PPC data to GPC data. */
6513                                 /* note 1 PES_PER_GPC case */
6514                                 u32 litter_num_pes_per_gpc =
6515                                         proj_scal_litter_num_pes_per_gpc_v();
6516                                 if (litter_num_pes_per_gpc > 1) {
6517                                         offset_to_segment +=
6518                                                 (((gr->ctx_vars.ctxsw_regs.tpc.count *
6519                                                    num_tpcs) << 2) +
6520                                                  ((reg_list_ppc_count * num_ppcs) << 2));
6521                                 } else {
6522                                         offset_to_segment +=
6523                                                 ((gr->ctx_vars.ctxsw_regs.tpc.count *
6524                                                   num_tpcs) << 2);
6525                                 }
6526                         } else {
6527                                 gk20a_err(dev_from_gk20a(g),
6528                                            "Unknown address type.\n");
6529                                 return -EINVAL;
6530                         }
6531                         err = gr_gk20a_process_context_buffer_priv_segment(g,
6532                                                            addr_type, addr,
6533                                                            i, num_tpcs,
6534                                                            num_ppcs, ppc_mask,
6535                                                            &offset);
6536                         if (err)
6537                                 return -EINVAL;
6538
6539                         *priv_offset = offset_to_segment + offset;
6540                         return 0;
6541                 }
6542         }
6543
6544         return -EINVAL;
6545 }
6546
6547
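     /*
      * Apply a batch of register operations against a channel's graphics
      * context.  Writes are handled in the first pass and reads in the second.
      * If the channel's context is resident on the engine, the registers are
      * accessed directly; otherwise the offsets are resolved into the context
      * image and the image is read/patched through a CPU mapping.
      *
      * Minimal usage sketch for a single context read (illustrative only; the
      * offset is made up and the NVHOST_DBG_GPU_REG_OP_* names are assumed
      * from nvhost_dbg_gpu_ioctl.h):
      *
      *      struct nvhost_dbg_gpu_reg_op op = {
      *              .op     = NVHOST_DBG_GPU_REG_OP_READ_32,
      *              .type   = NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX,
      *              .offset = 0x419e44,
      *      };
      *      err = gr_gk20a_exec_ctx_ops(ch, &op, 1, 0, 1);
      */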
6548 int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
6549                           struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
6550                           u32 num_ctx_wr_ops, u32 num_ctx_rd_ops)
6551 {
6552         struct gk20a *g = ch->g;
6553         struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
6554         void *ctx_ptr = NULL;
6555         int curr_gr_chid, curr_gr_ctx;
6556         bool ch_is_curr_ctx, restart_gr_ctxsw = false;
6557         u32 i, j, offset, v;
6558         u32 max_offsets = proj_scal_litter_num_gpcs_v() *
6559                 proj_scal_litter_num_tpc_per_gpc_v();
6560         u32 *offsets = NULL;
6561         u32 *offset_addrs = NULL;
6562         u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
6563         int err, pass;
6564
6565         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
6566                    num_ctx_wr_ops, num_ctx_rd_ops);
6567
6568         /* Disable channel switching so that the hardware state can be
6569          * inspected to determine whether the context we're interested
6570          * in is currently resident on the engine.
6571          */
6572         err = gr_gk20a_disable_ctxsw(g);
6573         if (err) {
6574                 gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw");
6575                 /* this should probably be ctx-fatal... */
6576                 goto cleanup;
6577         }
6578
6579         restart_gr_ctxsw = true;
6580
6581         curr_gr_ctx  = gk20a_readl(g, gr_fecs_current_ctx_r());
6582         curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx);
6583         ch_is_curr_ctx = (curr_gr_chid != -1) && (ch->hw_chid == curr_gr_chid);
6584
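             /*
              * If this channel's context is resident on the engine, operate on
              * the live registers directly; the in-memory context image would
              * not reflect the current state.  Otherwise fall through and work
              * on the context image instead.
              */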
6585         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d", ch_is_curr_ctx);
6586         if (ch_is_curr_ctx) {
6587                 for (pass = 0; pass < 2; pass++) {
6588                         ctx_op_nr = 0;
6589                         for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
6590                                 /* only do ctx ops and only on the right pass */
6591                                 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
6592                                     (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
6593                                      ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
6594                                         continue;
6595
6596                                 /* if this is a quad access, set up for special access */
6597                                 if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)
6598                                                 && g->ops.gr.access_smpc_reg)
6599                                         g->ops.gr.access_smpc_reg(g,
6600                                                         ctx_ops[i].quad,
6601                                                         ctx_ops[i].offset);
6602                                 offset = ctx_ops[i].offset;
6603
6604                                 if (pass == 0) { /* write pass */
6605                                         v = gk20a_readl(g, offset);
6606                                         v &= ~ctx_ops[i].and_n_mask_lo;
6607                                         v |= ctx_ops[i].value_lo;
6608                                         gk20a_writel(g, offset, v);
6609
6610                                         gk20a_dbg(gpu_dbg_gpu_dbg,
6611                                                    "direct wr: offset=0x%x v=0x%x",
6612                                                    offset, v);
6613
6614                                         if (ctx_ops[i].op == REGOP(WRITE_64)) {
6615                                                 v = gk20a_readl(g, offset + 4);
6616                                                 v &= ~ctx_ops[i].and_n_mask_hi;
6617                                                 v |= ctx_ops[i].value_hi;
6618                                                 gk20a_writel(g, offset + 4, v);
6619
6620                                                 gk20a_dbg(gpu_dbg_gpu_dbg,
6621                                                            "direct wr: offset=0x%x v=0x%x",
6622                                                            offset + 4, v);
6623                                         }
6624
6625                                 } else { /* read pass */
6626                                         ctx_ops[i].value_lo =
6627                                                 gk20a_readl(g, offset);
6628
6629                                         gk20a_dbg(gpu_dbg_gpu_dbg,
6630                                                    "direct rd: offset=0x%x v=0x%x",
6631                                                    offset, ctx_ops[i].value_lo);
6632
6633                                         if (ctx_ops[i].op == REGOP(READ_64)) {
6634                                                 ctx_ops[i].value_hi =
6635                                                         gk20a_readl(g, offset + 4);
6636
6637                                                 gk20a_dbg(gpu_dbg_gpu_dbg,
6638                                                            "direct rd: offset=0x%x v=0x%x",
6639                                                            offset + 4, ctx_ops[i].value_hi);
6640                                         } else
6641                                                 ctx_ops[i].value_hi = 0;
6642                                 }
6643                                 ctx_op_nr++;
6644                         }
6645                 }
6646                 goto cleanup;
6647         }
6648
6649         /* they're the same size, so just use one alloc for both */
6650         offsets = kzalloc(2 * sizeof(u32) * max_offsets, GFP_KERNEL);
6651         if (!offsets) {
6652                 err = -ENOMEM;
6653                 goto cleanup;
6654         }
6655         offset_addrs = offsets + max_offsets;
6656
6657         /* This would have been a variant of gr_gk20a_apply_instmem_overrides,
6658          * but it is recoded in-place instead. */
6659         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
6660                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
6661                         0, pgprot_dmacoherent(PAGE_KERNEL));
6662         if (!ctx_ptr) {
6663                 err = -ENOMEM;
6664                 goto cleanup;
6665         }
6666
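             /*
              * Flush the GPU L2 so that the CPU accesses to the context image
              * below see the most recently saved context contents.
              */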
6667         gk20a_mm_l2_flush(g, true);
6668
6669         /* Write to the appropriate place in the context image; first
6670          * figure out where that place actually is. */
6671
6672         /* first pass is writes, second reads */
6673         for (pass = 0; pass < 2; pass++) {
6674                 ctx_op_nr = 0;
6675                 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
6676                         u32 num_offsets;
6677
6678                         /* only do ctx ops and only on the right pass */
6679                         if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
6680                             (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
6681                              ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
6682                                 continue;
6683
6684                         err = gr_gk20a_get_ctx_buffer_offsets(g,
6685                                                 ctx_ops[i].offset,
6686                                                 max_offsets,
6687                                                 offsets, offset_addrs,
6688                                                 &num_offsets,
6689                                                 ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
6690                                                 ctx_ops[i].quad);
6691                         if (err) {
6692                                 gk20a_dbg(gpu_dbg_gpu_dbg,
6693                                            "ctx op invalid offset: offset=0x%x",
6694                                            ctx_ops[i].offset);
6695                                 ctx_ops[i].status =
6696                                         NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET;
6697                                 continue;
6698                         }
6699
6700                         /* if this is a quad access, set up for special access */
6701                         if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD) &&
6702                                         g->ops.gr.access_smpc_reg)
6703                                 g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad,
6704                                                          ctx_ops[i].offset);
6705
6706                         for (j = 0; j < num_offsets; j++) {
6707                                 /* sanity check: skip offsets outside the golden image */
6708                                 if (offsets[j] >= g->gr.ctx_vars.golden_image_size)
6709                                         continue;
6710                                 if (pass == 0) { /* write pass */
6711                                         v = gk20a_mem_rd32(ctx_ptr + offsets[j], 0);
6712                                         v &= ~ctx_ops[i].and_n_mask_lo;
6713                                         v |= ctx_ops[i].value_lo;
6714                                         gk20a_mem_wr32(ctx_ptr + offsets[j], 0, v);
6715
6716                                         gk20a_dbg(gpu_dbg_gpu_dbg,
6717                                                    "context wr: offset=0x%x v=0x%x",
6718                                                    offsets[j], v);
6719
6720                                         if (ctx_ops[i].op == REGOP(WRITE_64)) {
6721                                                 v = gk20a_mem_rd32(ctx_ptr + offsets[j] + 4, 0);
6722                                                 v &= ~ctx_ops[i].and_n_mask_hi;
6723                                                 v |= ctx_ops[i].value_hi;
6724                                                 gk20a_mem_wr32(ctx_ptr + offsets[j] + 4, 0, v);
6725
6726                                                 gk20a_dbg(gpu_dbg_gpu_dbg,
6727                                                            "context wr: offset=0x%x v=0x%x",
6728                                                            offsets[j] + 4, v);
6729                                         }
6730
6731                                         /* check to see if we need to add a special WAR
6732                                            for some of the SMPC perf regs */
6733                                         gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j],
6734                                                         v, ctx_ptr);
6735
6736                                 } else { /* read pass */
6737                                         ctx_ops[i].value_lo =
6738                                                 gk20a_mem_rd32(ctx_ptr + offsets[0], 0);
6739
6740                                         gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
6741                                                    offsets[0], ctx_ops[i].value_lo);
6742
6743                                         if (ctx_ops[i].op == REGOP(READ_64)) {
6744                                                 ctx_ops[i].value_hi =
6745                                                         gk20a_mem_rd32(ctx_ptr + offsets[0] + 4, 0);
6746
6747                                                 gk20a_dbg(gpu_dbg_gpu_dbg,
6748                                                            "context rd: offset=0x%x v=0x%x",
6749                                                            offsets[0] + 4, ctx_ops[i].value_hi);
6750                                         } else
6751                                                 ctx_ops[i].value_hi = 0;
6752                                 }
6753                         }
6754                         ctx_op_nr++;
6755                 }
6756         }
6757 #if 0
6758         /* flush cpu caches for the ctx buffer? only if cpu cached, of course.
6759          * they aren't, yet */
6760         if (cached) {
6761                 FLUSH_CPU_DCACHE(ctx_ptr,
6762                          sg_phys(ch_ctx->gr_ctx.mem.ref), size);
6763         }
6764 #endif
6765
6766  cleanup:
6767         if (offsets)
6768                 kfree(offsets);
6769
6770         if (ctx_ptr)
6771                 vunmap(ctx_ptr);
6772
6773         if (restart_gr_ctxsw) {
6774                 int tmp_err = gr_gk20a_enable_ctxsw(g);
6775                 if (tmp_err) {
6776                         gk20a_err(dev_from_gk20a(g), "unable to restart ctxsw!\n");
6777                         err = tmp_err;
6778                 }
6779         }
6780
6781         return err;
6782 }
6783
6784 static void gr_gk20a_cb_size_default(struct gk20a *g)
6785 {
6786         struct gr_gk20a *gr = &g->gr;
6787
6788         gr->attrib_cb_default_size =
6789                 gr_gpc0_ppc0_cbm_cfg_size_default_v();
6790         gr->alpha_cb_default_size =
6791                 gr_gpc0_ppc0_cbm_cfg2_size_default_v();
6792 }
6793
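     /*
      * Total backing-store size for the global attribute and alpha circular
      * buffers:
      *
      *   size = attrib_cb_size * cbm_cfg_size_granularity  * max_tpc_count
      *        + alpha_cb_size  * cbm_cfg2_size_granularity * max_tpc_count
      *
      * where alpha_cb_size is the default alpha CB size scaled by 1.5.
      */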
6794 static int gr_gk20a_calc_global_ctx_buffer_size(struct gk20a *g)
6795 {
6796         struct gr_gk20a *gr = &g->gr;
6797         int size;
6798
6799         gr->attrib_cb_size = gr->attrib_cb_default_size;
6800         gr->alpha_cb_size = gr->alpha_cb_default_size
6801                 + (gr->alpha_cb_default_size >> 1);
6802
6803         size = gr->attrib_cb_size *
6804                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() *
6805                 gr->max_tpc_count;
6806
6807         size += gr->alpha_cb_size *
6808                 gr_gpc0_ppc0_cbm_cfg2_size_granularity_v() *
6809                 gr->max_tpc_count;
6810
6811         return size;
6812 }
6813
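     /*
      * Program the page pool base address and size into the SCC, GPC GCC and
      * PD page pool registers.  With 'patch' set, the values are staged via
      * gr_gk20a_ctx_patch_write() into the channel's patch context rather
      * than written to the registers directly.
      */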
6814 void gr_gk20a_commit_global_pagepool(struct gk20a *g,
6815                                             struct channel_ctx_gk20a *ch_ctx,
6816                                             u64 addr, u32 size, bool patch)
6817 {
6818         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
6819                 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
6820
6821         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
6822                 gr_scc_pagepool_total_pages_f(size) |
6823                 gr_scc_pagepool_valid_true_f(), patch);
6824
6825         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
6826                 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
6827
6828         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
6829                 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
6830
6831         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
6832                 gr_pd_pagepool_total_pages_f(size) |
6833                 gr_pd_pagepool_valid_true_f(), patch);
6834 }
6835
6836 void gk20a_init_gr(struct gk20a *g)
6837 {
6838         init_waitqueue_head(&g->gr.init_wq);
6839 }
6840
6841 void gk20a_init_gr_ops(struct gpu_ops *gops)
6842 {
6843         gops->gr.access_smpc_reg = gr_gk20a_access_smpc_reg;
6844         gops->gr.bundle_cb_defaults = gr_gk20a_bundle_cb_defaults;
6845         gops->gr.cb_size_default = gr_gk20a_cb_size_default;
6846         gops->gr.calc_global_ctx_buffer_size =
6847                 gr_gk20a_calc_global_ctx_buffer_size;
6848         gops->gr.commit_global_attrib_cb = gr_gk20a_commit_global_attrib_cb;
6849         gops->gr.commit_global_bundle_cb = gr_gk20a_commit_global_bundle_cb;
6850         gops->gr.commit_global_cb_manager = gr_gk20a_commit_global_cb_manager;
6851         gops->gr.commit_global_pagepool = gr_gk20a_commit_global_pagepool;
6852         gops->gr.handle_sw_method = gr_gk20a_handle_sw_method;
6853         gops->gr.set_alpha_circular_buffer_size =
6854                 gk20a_gr_set_alpha_circular_buffer_size;
6855         gops->gr.set_circular_buffer_size =
6856                 gk20a_gr_set_circular_buffer_size;
6857         gops->gr.enable_hww_exceptions = gr_gk20a_enable_hww_exceptions;
6858         gops->gr.is_valid_class = gr_gk20a_is_valid_class;
6859         gops->gr.get_sm_dsm_perf_regs = gr_gk20a_get_sm_dsm_perf_regs;
6860         gops->gr.get_sm_dsm_perf_ctrl_regs = gr_gk20a_get_sm_dsm_perf_ctrl_regs;
6861         gops->gr.init_fs_state = gr_gk20a_ctx_state_floorsweep;
6862         gops->gr.set_hww_esr_report_mask = gr_gk20a_set_hww_esr_report_mask;
6863         gops->gr.setup_alpha_beta_tables = gr_gk20a_setup_alpha_beta_tables;
6864 }