This article collects typical usage examples of the Python function tvm.thread_axis. If you have been wondering how thread_axis is used in practice, or are looking for concrete examples, the curated code samples below should help.
Shown below are 20 code examples of the thread_axis function, sorted by popularity by default.
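Before the individual examples, here is a minimal end-to-end sketch of the pattern they all share: split an axis, then bind the resulting outer and inner iteration variables to GPU block and thread indices with tvm.thread_axis. It assumes the pre-0.7 TVM API used throughout this page; the tensor names A and B, the length 1024, and the split factor 64 are illustrative choices, not taken from any particular example below.

import numpy as np
import tvm

n = 1024
A = tvm.placeholder((n,), name='A')
B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')
s = tvm.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)      # outer ("block") and inner ("thread") parts
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))      # map the outer part to the GPU grid
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))     # map the inner part to threads within a block
if tvm.gpu(0).exist:                              # only run when a CUDA device is available
    fadd = tvm.build(s, [A, B], "cuda", name="add_one")
    ctx = tvm.gpu(0)
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
    fadd(a, b)
    np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)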
Example 1: test_bound_nest_thread
def test_bound_nest_thread():
    m = tvm.var('m')
    A = tvm.placeholder((m), name='A')
    A1 = tvm.compute((m,), lambda i: A[i], name='A1')
    A2 = tvm.compute((m,), lambda i: A1[i] + 2, name='A2')
    A3 = tvm.compute((m,), lambda i: A2[i] + 3, name='A3')

    s = tvm.create_schedule(A3.op)
    s[A2].set_scope("shared")
    s[A1].set_scope("local")

    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    bx, tx = s[A3].split(A3.op.axis[0], factor=32)
    s[A3].bind(bx, block_x)
    s[A3].bind(tx, thread_x)
    s[A2].compute_at(s[A3], tx)
    _, xi = s[A2].split(A2.op.axis[0], nparts=1)
    s[A2].bind(xi, thread_x)
    s[A1].compute_at(s[A3], tx)
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    assert(bounds[A1.op.axis[0]].extent.value==1)
    assert(bounds[A2.op.axis[0]].extent.value==32)
    assert(bounds[A3.op.axis[0]].extent == m)
Author: bddppq, Project: tvm, Lines: 25, Source: test_schedule_bound_inference.py
Example 2: _schedule_injective
def _schedule_injective(op, sch):
    x = op.output(0)
    fused = sch[x].fuse(*sch[x].op.axis)
    num_thread = tvm.target.current_target(allow_none=False).max_num_threads
    max_block = 256

    try:
        const_size = util.get_const_int(util.prod(x.shape))
        max_block = 256
        need_block_split = const_size > max_block * num_thread
    except ValueError:
        need_block_split = False

    if need_block_split:
        xo, xi = sch[x].split(fused, factor=num_thread * max_block)
        bx, tx = sch[x].split(xi, factor=num_thread)
        sch[x].reorder(bx, tx, xo)
        sch[x].bind(bx, tvm.thread_axis("blockIdx.x"))
        sch[x].bind(tx, tvm.thread_axis("threadIdx.x"))
    else:
        bx, tx = sch[x].split(fused, factor=num_thread)
        sch[x].bind(tx, tvm.thread_axis("threadIdx.x"))
        sch[x].bind(bx, tvm.thread_axis("blockIdx.x"))
    return sch
Author: bddppq, Project: tvm, Lines: 25, Source: injective.py
Example 3: test_storage_share_gpu
def test_storage_share_gpu():
    m = tvm.var('m')
    A = [tvm.placeholder((m), name='A')]
    num_stage = 5
    for t in range(num_stage):
        A.append(tvm.compute((m,), lambda i: A[-1][i] + (t+1), name='A%d_s' % t))
        A.append(tvm.compute((m,), lambda i: A[-1][i], name='A%d' % t))

    s = tvm.create_schedule(A[-1].op)
    for t in range(num_stage):
        x = A[2*t+2].op.axis[0]
        bx, tx = s[A[2*t+2]].split(x, factor=32)
        s[A[2*t+2]].bind(bx, tvm.thread_axis("blockIdx.x"))
        s[A[2*t+2]].bind(tx, tvm.thread_axis("threadIdx.x"))
        s[A[2*t+1]].compute_at(s[A[2*t+2]], tx)
        s[A[2*t+1]].set_scope("shared")

    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A[0].shape, A[0].dtype, name='A')
    Bb = tvm.decl_buffer(A[0].shape, A[0].dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A[0]: Ab, A[-1]: Bb}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    alloc_stats = {"global": 0, "shared": 0}

    def verify(n):
        if isinstance(n, tvm.stmt.AttrStmt):
            if n.attr_key == "storage_scope":
                alloc_stats[n.value.value] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert alloc_stats["global"] == 2
    assert alloc_stats["shared"] == num_stage
Author: bddppq, Project: tvm, Lines: 34, Source: test_pass_storage_rewrite.py
Example 4: extern
def extern(ins, outs):
    # pylint: disable=unused-argument
    """construct measurement function by building IR directly"""
    ib = tvm.ir_builder.create()

    bx = tvm.thread_axis("blockIdx.x")
    tx = tvm.thread_axis("threadIdx.x")

    ib.scope_attr(bx, "thread_extent", n // max_threads)
    ib.scope_attr(tx, "thread_extent", max_threads)

    idx = bx.var * max_threads + tx.var

    a = ib.allocate(dtype, (1), name='a', scope='local')
    b = ib.allocate(dtype, (1), name='b', scope='local')
    a[0] = outs[0].vload(idx, dtype)
    b[0] = outs[0].vload(idx, dtype)

    if base_type.find('float') != -1:
        mad_func = lambda x, y: (x * x + y)
    else:
        mad_func = lambda x, y: y * y + x

    for _ in range(item_per_thread // 4 // lanes):
        a[0] = mad_func(a[0], b[0])
        b[0] = mad_func(b[0], a[0])

    ib.emit(outs[0].vstore(idx, b[0]))
    return ib.get()
Author: LANHUIYING, Project: tvm, Lines: 30, Source: peak.py
Example 5: get_gemm_feature
def get_gemm_feature(target):
    k = tvm.reduce_axis((0, N), 'k')
    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='B')
    C = tvm.compute(A.shape, lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k),
                    name='C')
    s = tvm.create_schedule(C.op)

    y, x = s[C].op.axis
    axes = list(s[C].tile(y, x, 8, 8)) + [k]
    perm = np.random.permutation(5)
    axes = [axes[x] for x in perm]
    s[C].reorder(*axes)

    if "gpu" in target.keys:
        pick = []
        # filter out reduction axis
        for i in range(len(perm)):
            if perm[i] != 4:
                pick.append(axes[i])
        s[C].bind(pick[0], tvm.thread_axis("blockIdx.x"))
        s[C].bind(pick[1], tvm.thread_axis("vthread"))
        s[C].bind(pick[2], tvm.thread_axis("threadIdx.y"))

    with target:
        feas = feature.get_itervar_feature(s, [A, B, C])
        feas = feature.flatten_itervar_feature(feas)
    return feas
Author: LANHUIYING, Project: tvm, Lines: 29, Source: test_autotvm_feature.py
Example 6: test_shared_memory
def test_shared_memory():
    N = 1024
    M = 128
    A = tvm.placeholder((N,), name='A', dtype='float32')
    B = tvm.compute((N, ), lambda i: A[i], name='B')

    s = tvm.create_schedule([B.op])
    AA = s.cache_read(A, "shared", [B])
    o, i = s[B].split(s[B].op.axis[0], M)
    s[AA].compute_at(s[B], o)
    s[B].bind(o, tvm.thread_axis("blockIdx.x"))
    s[B].bind(i, tvm.thread_axis("threadIdx.x"))

    # shared memory usage: M * 4B
    # thread usage: M
    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue
        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=4 * M - 1,
                                    max_threads_per_block=M))]}):
            tvm.build(s, [A, B], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=4 * M,
                                    max_threads_per_block=M))]}):
            tvm.build(s, [A, B], target)
        assert valid[0]
Author: bddppq, Project: tvm, Lines: 34, Source: test_pass_verify_gpu_code.py
Example 7: check_cuda
def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
    C = tvm.placeholder((n,), name='C', dtype="int32")
    D = tvm.compute((n,),
                    lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name='D')
    s = tvm.create_schedule(D.op)
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B, C, D], "cuda")
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_c = np.random.randint(low=0, high=127, size=(n,))
    np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
    b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
    c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
    d = tvm.nd.empty((n,), D.dtype, ctx)
    fun(a, b, c, d)
    tvm.testing.assert_allclose(d.asnumpy(), np_d)
Author: bddppq, Project: tvm, Lines: 28, Source: test_codegen_cuda.py
Example 8: run_opencl
def run_opencl():
    # NOTE: This is the setting for my rk3399 board. You need to modify
    # them according to your environment.
    target_host = "llvm -target=aarch64-linux-gnu"
    opencl_device_host = '10.77.1.145'
    opencl_device_port = 9090

    # create schedule for the above "add one" compute declaration
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=32)
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    func = tvm.build(s, [A, B], "opencl", target_host=target_host)

    remote = rpc.connect(opencl_device_host, opencl_device_port)

    # export and upload
    path = temp.relpath('lib_cl.tar')
    func.export_library(path)
    remote.upload(path)
    func = remote.load_module('lib_cl.tar')

    # run
    ctx = remote.cl()
    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    func(a, b)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
    print("OpenCL test passed!")
Author: LANHUIYING, Project: tvm, Lines: 29, Source: cross_compilation_and_rpc.py
Example 9: test_exp
def test_exp():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: tvm.exp(A(*i)), name='B')
    s = tvm.create_schedule(B.op)
    # create iter var and assign them tags.
    num_thread = 8
    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

    # one line to build the function.
    def check_device(device, host="stackvm"):
        if not tvm.module.enabled(host):
            return
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        fexp = tvm.build(s, [A, B],
                         device, host,
                         name="myexp")
        ctx = tvm.context(device, 0)
        # launch the kernel.
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        fexp(a, b)
        np.testing.assert_allclose(
            b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)

    check_device("cuda", "llvm")
    check_device("vulkan")
    check_device("opencl")
Author: gwli, Project: tvm, Lines: 34, Source: test_ewise.py
Example 10: test_multiple_kernels
def test_multiple_kernels():
    N = 1024
    A = tvm.placeholder((N, N), name='A')
    B = tvm.compute((N, N), lambda i, j: A[i, j])
    C = tvm.compute((N, N), lambda i, j: B[i, j])

    s = tvm.create_schedule([C.op])
    s[C].bind(s[C].op.axis[1], tvm.thread_axis("threadIdx.x"))
    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))

    # shared memory usage: 0
    # thread usage: N
    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue
        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=0,
                                    max_threads_per_block=N - 1))]}):
            tvm.build(s, [A, C], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=0,
                                    max_threads_per_block=N))]}):
            tvm.build(s, [A, C], target)
        assert valid[0]
Author: bddppq, Project: tvm, Lines: 33, Source: test_pass_verify_gpu_code.py
Example 11: traverse
def traverse(op):
    """inline all one-to-one-mapping operators except the last stage (output)"""
    if "nms" in op.tag:
        sort = op.input_tensors[1]
        score = s[sort].op.input_tensors[0]
        fused = s[score].fuse(*s[score].op.axis)
        num_thread = tvm.target.current_target(allow_none=False).max_num_threads
        bx, tx = s[score].split(fused, factor=num_thread)
        s[score].bind(bx, tvm.thread_axis("blockIdx.x"))
        s[score].bind(tx, tvm.thread_axis("threadIdx.x"))
    if tag.is_broadcast(op.tag):
        if op not in s.outputs:
            s[op].compute_inline()
        else:
            x = op.output(0)
            fused = s[x].fuse(*s[x].op.axis)
            num_thread = tvm.target.current_target(allow_none=False).max_num_threads
            bx, tx = s[x].split(fused, factor=num_thread)
            s[x].bind(bx, tvm.thread_axis("blockIdx.x"))
            s[x].bind(tx, tvm.thread_axis("threadIdx.x"))
        for tensor in op.input_tensors:
            if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                traverse(tensor.op)

    scheduled_ops.append(op)
Author: LANHUIYING, Project: tvm, Lines: 25, Source: vision.py
Example 12: try_warp_memory
def try_warp_memory():
    """skip this in default test because it require higher arch"""
    m = 128
    A = tvm.placeholder((m,), name='A')
    B = tvm.compute((m,), lambda i: A[i] + 3, name='B')
    warp_size = 32
    s = tvm.create_schedule(B.op)
    AA = s.cache_read(A, "warp", [B])
    xo, xi = s[B].split(B.op.axis[0], warp_size * 2)
    xi0, xi1 = s[B].split(xi, factor=warp_size)
    tx = tvm.thread_axis("threadIdx.x")
    s[B].bind(xi1, tx)
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[AA].compute_at(s[B], xo)
    xo, xi = s[AA].split(s[AA].op.axis[0], warp_size)
    s[AA].bind(xi, tx)

    @tvm.register_func
    def tvm_callback_cuda_compile(code):
        ptx = nvcc.compile_cuda(code, target="ptx")
        return ptx

    # one line to build the function.
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("skip because %s is not enabled.." % device)
            return
        f = tvm.build(s, [A, B], device)
        a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx)
        f(a, b)
        tvm.testing.assert_allclose(
            b.asnumpy(), a.asnumpy() + 3, rtol=1e-6)

    check_device("cuda")
Author: bddppq, Project: tvm, Lines: 35, Source: test_ewise.py
Example 13: test_rfactor_argmax
def test_rfactor_argmax():
    def fcombine(x, y):
        lhs = tvm.make.Select((x[1] >= y[1]), x[0], y[0])
        rhs = tvm.make.Select((x[1] >= y[1]), x[1], y[1])
        return lhs, rhs

    def fidentity(t0, t1):
        return tvm.const(-1, t0), tvm.min_value(t1)

    argmax = tvm.comm_reducer(fcombine,
                              fidentity,
                              name='argmax')

    nn = 1027
    mm = 10
    n = tvm.convert(nn)
    m = tvm.convert(mm)
    A0 = tvm.placeholder((m, n), name='A0', dtype='int32')
    A1 = tvm.placeholder((m, n), name='A1', dtype='float32')
    k = tvm.reduce_axis((0, n))
    B0, B1 = tvm.compute((m,), lambda i: argmax((A0[i, k], A1[i, k]), axis=k), name='B')

    # schedule
    s = tvm.create_schedule(B0.op)
    nthread = 16
    ko, kf = s[B0].split(k, factor=nthread)
    BF0, BF1 = s.rfactor(B0, kf)
    bx, ty = s[B0].split(s[B0].op.axis[0], factor=nthread)
    s[B0].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B0].bind(ty, tvm.thread_axis("threadIdx.y"))
    tx = s[B0].op.reduce_axis[0]
    thread_x = tvm.thread_axis("threadIdx.x")
    s[B0].bind(tx, thread_x)
    s[BF0.op].compute_at(s[B0], tx)
    s[B0].set_store_predicate(thread_x.var.equal(0))

    def check_target(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("skip because %s is not enabled.." % device)
            return
        fapi = tvm.lower(s, args=[A0, A1, B0, B1])
        fargmax = tvm.build(fapi,
                            target=device,
                            name="argmax")

        np_idx = np.repeat(np.arange(nn, dtype='int32').reshape(1, nn), mm, axis=0)
        np_val = np.random.uniform(size=(mm, nn)).astype('float32')
        np_res = np.argmax(np_val, axis=1)

        nd_idx = tvm.nd.array(np_idx, ctx)
        nd_val = tvm.nd.array(np_val, ctx)
        nd_res0 = tvm.nd.array(np.zeros(mm, dtype='int32'), ctx)
        nd_res1 = tvm.nd.array(np.zeros(mm, dtype='float32'), ctx)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.asnumpy())

    check_target("cuda")
    check_target("vulkan")
Author: bddppq, Project: tvm, Lines: 59, Source: test_reduce.py
Example 14: nms_ir
def nms_ir(sorted_bbox_buf, out_buf, nms_threshold):
    """Non-maximum suppression.

    Parameters
    ----------
    sorted_bbox_buf : tvm.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
        [w_start, h_start, w_end, h_end, score].

    out_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed.

    nms_threshold : float
        Non-maximum suppression threshold.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes."""
        w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                    - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx]) + 1.0)
        h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                    - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]) + 1.0)
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx] + 1.0) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1] + 1.0) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx] + 1.0) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1] + 1.0) - i
        return i / u

    batch, num_bbox = get_const_tuple(out_buf.shape)
    max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads))
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(sorted_bbox_buf)
    p_out = ib.buffer_ptr(out_buf)
    nthread_tx = max_threads
    nthread_bx = num_bbox // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    i = bx * max_threads + tx
    with ib.for_range(0, batch, for_type="unroll", name="n") as b:
        base_idx = b * num_bbox
        with ib.if_scope(i < num_bbox):
            p_out[base_idx + i] = False
        with ib.for_range(0, num_bbox - 1) as l:
            with ib.if_scope(tvm.all(i < num_bbox, i > l, p_out[base_idx + l] == False)):
                iou = calculate_overlap(p_data, (base_idx + l) * 5, (base_idx + i) * 5)
                with ib.if_scope(iou > nms_threshold):
                    p_out[base_idx + i] = True
    ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                          tvm.convert(['shared']),
                          tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()
Author: bddppq, Project: tvm, Lines: 59, Source: proposal.py
Example 15: fuse_and_bind
def fuse_and_bind(s, tensor, axis=None, num_thread=None):
    """ fuse all the axis and bind to GPU threads """
    axis = axis or s[tensor].op.axis
    fused = s[tensor].fuse(*axis)
    bx, tx = s[tensor].split(fused, num_thread)
    s[tensor].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[tensor].bind(tx, tvm.thread_axis("threadIdx.x"))
    return bx, tx
Author: LANHUIYING, Project: tvm, Lines: 8, Source: dense.py
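A possible call site for the fuse_and_bind helper above, shown only as a sketch: the schedule s, the output tensor C, and the thread count 256 are assumptions for illustration and do not come from dense.py.

# Hypothetical usage: C is some GPU output tensor; 256 is an arbitrary thread count.
s = tvm.create_schedule(C.op)
bx, tx = fuse_and_bind(s, C, num_thread=256)   # all axes of C fused, then split and bound to blockIdx.x / threadIdx.x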
Example 16: _schedule_output
def _schedule_output(op, sch):
    x = op.output(0)
    fused = sch[x].fuse(*sch[x].op.axis)
    num_thread = tvm.target.current_target(allow_none=False).max_num_threads
    bx, tx = sch[x].split(fused, factor=num_thread)
    sch[x].bind(bx, tvm.thread_axis("blockIdx.x"))
    sch[x].bind(tx, tvm.thread_axis("threadIdx.x"))
    return sch
Author: gwli, Project: tvm, Lines: 8, Source: extern.py
Example 17: test_device_module_dump
def test_device_module_dump():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)
    # create iter var and assign them tags.
    num_thread = 8
    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        temp = util.tempdir()
        name = "myadd_%s" % device
        if sys.platform == "darwin" or sys.platform.startswith('linux'):
            f = tvm.build(s, [A, B], device, "llvm -system-lib", name=name)
        elif sys.platform == "win32":
            f = tvm.build(s, [A, B], device, "llvm", name=name)
        else:
            raise ValueError("Unsupported platform")

        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)

        f1 = tvm.module.load(path_dso)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        f1(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        if sys.platform != "win32":
            f2 = tvm.module.system_lib()
            f2[name](a, b)
            np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    def check_stackvm(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        temp = util.tempdir()
        name = "myadd_%s" % device
        f = tvm.build(s, [A, B], device, "stackvm", name=name)
        path_dso = temp.relpath("dev_lib.stackvm")
        #f.export_library(path_dso)
        #f1 = tvm.module.load(path_dso)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        f(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    for device in ["cuda", "vulkan", "opencl", "metal"]:
        check_device(device)
        check_stackvm(device)
Author: LANHUIYING, Project: tvm, Lines: 58, Source: test_module_load.py
Example 18: tile_and_bind
def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None):
    """ tile and bind to GPU threads """
    x_factor = x_factor or y_factor
    yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor)
    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
    return yo, xo, yi, xi
Author: LANHUIYING, Project: tvm, Lines: 9, Source: conv2d.py
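Likewise, a hypothetical call site for the tile_and_bind helper above: the schedule s, the 2-D output tensor C, and the 8x8 tile size are illustrative assumptions only.

# Hypothetical usage: tile a 2-D output 8x8 and bind the tiles to a 2-D grid of blocks and threads.
s = tvm.create_schedule(C.op)
y, x = s[C].op.axis
yo, xo, yi, xi = tile_and_bind(s, C, y, x, y_factor=8)   # x_factor defaults to y_factor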
Example 19: get_valid_counts_upsweep
def get_valid_counts_upsweep(data, idx_in, idx, partial):
    """Low level IR of first step of scan: upsweep.

    Parameters
    ----------
    data : Buffer
        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.

    idx_in : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    idx : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    partial : Buffer
        2D Buffer of valid data indices with shape [batch_size, new_range].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    ib = tvm.ir_builder.create()
    data = ib.buffer_ptr(data)
    idx_in = ib.buffer_ptr(idx_in)
    idx = ib.buffer_ptr(idx)
    partial = ib.buffer_ptr(partial)
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    nthread_tx = max_threads
    nthread_bx = batch_size
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    new_range = num_anchors // elem_per_thread + 1
    # Scan: Upsweep:
    with ib.if_scope(tvm.all(bx < batch_size, tx < new_range)):
        with ib.for_range(0, elem_per_thread) as i:
            with ib.if_scope(bx * num_anchors + \
                             tx * elem_per_thread + i < batch_size * num_anchors):
                with ib.if_scope(i == 0):
                    partial[bx * new_range + tx] = idx_in[bx * num_anchors + tx * elem_per_thread]
                    idx[bx * num_anchors + tx * elem_per_thread] = \
                        idx_in[bx * num_anchors + tx * elem_per_thread]
                with ib.else_scope():
                    partial[bx * new_range + tx] += \
                        idx_in[bx * num_anchors + tx * elem_per_thread + i]
                    idx[bx * num_anchors + tx * elem_per_thread + i] = \
                        idx[bx * num_anchors + tx * elem_per_thread + i - 1] + \
                        idx_in[bx * num_anchors + tx * elem_per_thread + i]
    ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                          tvm.convert(['shared']),
                          tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()
Author: bddppq, Project: tvm, Lines: 57, Source: nms.py
Example 20: fuse_and_bind
def fuse_and_bind(s, tensor, axis=None, num_thread=None):
    """ fuse all the axis and bind to GPU threads """
    axis = axis or s[tensor].op.axis
    fused = s[tensor].fuse(*axis)
    max_threads = tvm.target.current_target(allow_none=False).max_num_threads
    bx, tx = s[tensor].split(fused, num_thread or max_threads)
    s[tensor].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[tensor].bind(tx, tvm.thread_axis("threadIdx.x"))
    return bx, tx
Author: gwli, Project: tvm, Lines: 9, Source: conv2d.py
Note: the tvm.thread_axis examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors; the source code remains copyrighted by its original authors, and redistribution and use are governed by each project's license. Do not reproduce without permission.