๋‹จ์ผ ๋ธ”๋ก์„ ์‚ฌ์šฉํ•œ ๊ธฐ๋ณธ ๋ฒ„์ „

1D LayoutTensor a์™€ 1D LayoutTensor b์˜ 1D ํ•ฉ์„ฑ๊ณฑ์„ ๊ณ„์‚ฐํ•˜์—ฌ 1D LayoutTensor output์— ์ €์žฅํ•˜๋Š” ์ปค๋„์„ ๊ตฌํ˜„ํ•˜์„ธ์š”.

์ฐธ๊ณ : ์ผ๋ฐ˜์ ์ธ ๊ฒฝ์šฐ๋ฅผ ์ฒ˜๋ฆฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ์Šค๋ ˆ๋“œ๋‹น ์ „์—ญ ์ฝ๊ธฐ 2ํšŒ, ์ „์—ญ ์“ฐ๊ธฐ 1ํšŒ๋งŒ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค.

ํ•ต์‹ฌ ๊ฐœ๋…

์ด ํผ์ฆ์—์„œ ๋‹ค๋ฃจ๋Š” ๋‚ด์šฉ:

  • GPU์—์„œ ์Šฌ๋ผ์ด๋”ฉ ์œˆ๋„์šฐ ์—ฐ์‚ฐ ๊ตฌํ˜„ํ•˜๊ธฐ
  • ์Šค๋ ˆ๋“œ ๊ฐ„ ๋ฐ์ดํ„ฐ ์˜์กด์„ฑ ๊ด€๋ฆฌํ•˜๊ธฐ
  • ๊ฒน์น˜๋Š” ์˜์—ญ์— ๊ณต์œ  ๋ฉ”๋ชจ๋ฆฌ ํ™œ์šฉํ•˜๊ธฐ

ํ•ต์‹ฌ์€ ๊ฒฝ๊ณ„ ์กฐ๊ฑด์„ ์˜ฌ๋ฐ”๋ฅด๊ฒŒ ์œ ์ง€ํ•˜๋ฉด์„œ๋„ ๊ฒน์น˜๋Š” ์›์†Œ์— ํšจ์œจ์ ์œผ๋กœ ์ ‘๊ทผํ•˜๋Š” ๋ฐฉ๋ฒ•์„ ์ดํ•ดํ•˜๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค.

๊ตฌ์„ฑ

  • ์ž…๋ ฅ ๋ฐฐ์—ด ํฌ๊ธฐ: SIZE = 6
  • ์ปค๋„ ํฌ๊ธฐ: CONV = 3
  • ๋ธ”๋ก๋‹น ์Šค๋ ˆ๋“œ ์ˆ˜: TPB = 8
  • ๋ธ”๋ก ์ˆ˜: 1
  • ๊ณต์œ  ๋ฉ”๋ชจ๋ฆฌ: SIZE์™€ CONV ํฌ๊ธฐ์˜ ๋ฐฐ์—ด 2๊ฐœ

์ฐธ๊ณ :

  • ๋ฐ์ดํ„ฐ ๋กœ๋”ฉ: ๊ฐ ์Šค๋ ˆ๋“œ๊ฐ€ ์ž…๋ ฅ ๋ฐฐ์—ด๊ณผ ์ปค๋„์—์„œ ์›์†Œ๋ฅผ ํ•˜๋‚˜์”ฉ ๋กœ๋“œ
  • ๋ฉ”๋ชจ๋ฆฌ ํŒจํ„ด: ์ž…๋ ฅ ๋ฐฐ์—ด๊ณผ ํ•ฉ์„ฑ๊ณฑ ์ปค๋„์„ ์ €์žฅํ•˜๋Š” ๊ณต์œ  ๋ฐฐ์—ด
  • ์Šค๋ ˆ๋“œ ๋™๊ธฐํ™”: ์—ฐ์‚ฐ ์‹œ์ž‘ ์ „ ์Šค๋ ˆ๋“œ ๊ฐ„ ์กฐ์œจ

์™„์„ฑํ•  ์ฝ”๋“œ

# Compile-time configuration for the single-block 1D convolution puzzle.
comptime TPB = 8  # threads per block (>= SIZE so one thread per element)
comptime SIZE = 6  # length of input array `a` and of `output`
comptime CONV = 3  # length of the convolution kernel `b`
comptime BLOCKS_PER_GRID = (1, 1)  # one block covers the whole array
comptime THREADS_PER_BLOCK = (TPB, 1)
comptime dtype = DType.float32
comptime in_layout = Layout.row_major(SIZE)  # 1D row-major layout for `a`
comptime out_layout = Layout.row_major(SIZE)  # 1D row-major layout for `output`
comptime conv_layout = Layout.row_major(CONV)  # 1D row-major layout for `b`


fn conv_1d_simple[
    in_layout: Layout, out_layout: Layout, conv_layout: Layout
](
    output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
    a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
    b: LayoutTensor[dtype, conv_layout, ImmutAnyOrigin],
):
    # Puzzle skeleton: compute output[i] = sum_j a[i + j] * b[j], skipping
    # terms where i + j falls past the end of `a` (equivalent to zero-padding).
    # Expected approach: stage `a` and `b` in shared memory, barrier(), then
    # accumulate — at most 2 global reads and 1 global write per thread.
    global_i = block_dim.x * block_idx.x + thread_idx.x  # global element index
    local_i = Int(thread_idx.x)  # index within this (single) block
    # FILL ME IN (roughly 14 lines)


์ „์ฒด ํŒŒ์ผ ๋ณด๊ธฐ: problems/p13/p13.mojo

ํŒ
  1. LayoutTensor[dtype, Layout.row_major(SIZE), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()์œผ๋กœ ๊ณต์œ  ๋ฉ”๋ชจ๋ฆฌ ํ• ๋‹น
  2. ์ž…๋ ฅ์„ shared_a[local_i]์—, ์ปค๋„์„ shared_b[local_i]์— ๋กœ๋“œ
  3. ๋ฐ์ดํ„ฐ ๋กœ๋“œ ํ›„ barrier() ํ˜ธ์ถœ
  4. ๊ฒฝ๊ณ„ ์•ˆ์—์„œ ๊ณฑ์„ ํ•ฉ์‚ฐ: if local_i + j < SIZE
  5. global_i < SIZE์ผ ๋•Œ๋งŒ ๊ฒฐ๊ณผ ๊ธฐ๋ก

์ฝ”๋“œ ์‹คํ–‰

์†”๋ฃจ์…˜์„ ํ…Œ์ŠคํŠธํ•˜๋ ค๋ฉด ํ„ฐ๋ฏธ๋„์—์„œ ๋‹ค์Œ ๋ช…๋ น์–ด๋ฅผ ์‹คํ–‰ํ•˜์„ธ์š”:

pixi run p13 --simple
pixi run -e amd p13 --simple
pixi run -e apple p13 --simple
uv run poe p13 --simple

ํผ์ฆ์„ ์•„์ง ํ’€์ง€ ์•Š์•˜๋‹ค๋ฉด ์ถœ๋ ฅ์€ ๋‹ค์Œ๊ณผ ๊ฐ™์Šต๋‹ˆ๋‹ค:

out: HostBuffer([0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
expected: HostBuffer([5.0, 8.0, 11.0, 14.0, 5.0, 0.0])

์†”๋ฃจ์…˜

fn conv_1d_simple[
    in_layout: Layout, out_layout: Layout, conv_layout: Layout
](
    output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
    a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
    b: LayoutTensor[dtype, conv_layout, ImmutAnyOrigin],
):
    # 1D convolution: output[i] = sum over j in [0, CONV) of a[i+j] * b[j],
    # with out-of-range terms skipped (equivalent to zero-padding the tail).
    global_i = block_dim.x * block_idx.x + thread_idx.x
    local_i = Int(thread_idx.x)
    # Stage both inputs in shared memory so each thread performs at most two
    # global reads (one from `a`, one from `b`) and one global write.
    shared_a = LayoutTensor[
        dtype,
        Layout.row_major(SIZE),
        MutAnyOrigin,
        address_space = AddressSpace.SHARED,
    ].stack_allocation()
    shared_b = LayoutTensor[
        dtype,
        Layout.row_major(CONV),
        MutAnyOrigin,
        address_space = AddressSpace.SHARED,
    ].stack_allocation()
    # Cooperative load: with TPB (8) >= SIZE (6) >= CONV (3) in this puzzle,
    # each element is copied by exactly one thread.
    if global_i < SIZE:
        shared_a[local_i] = a[global_i]

    if global_i < CONV:
        shared_b[local_i] = b[global_i]

    # Every thread's loads must complete before any thread reads its
    # neighbors' shared_a entries below.
    barrier()

    # Note: this is unsafe as it enforces no guard so could access `shared_a` beyond its bounds
    # local_sum = Scalar[dtype](0)
    # for j in range(CONV):
    #     if local_i + j < SIZE:
    #         local_sum += shared_a[local_i + j] * shared_b[j]

    # if global_i < SIZE:
    #     output[global_i] = local_sum

    # Safe and correct: only threads with a valid output element do any work.
    if global_i < SIZE:
        # Note: using `var` allows us to include the type in the type inference
        # `output.element_type` is available in LayoutTensor
        var local_sum: output.element_type = 0

        # Note: `@parameter` decorator unrolls the loop at compile time given `CONV` is a compile-time constant
        # See: https://docs.modular.com/mojo/manual/decorators/parameter/#parametric-for-statement
        @parameter
        for j in range(CONV):
            # Bonus: do we need this check for this specific example with fixed SIZE, CONV
            if local_i + j < SIZE:
                local_sum += shared_a[local_i + j] * shared_b[j]

        output[global_i] = local_sum


๊ณต์œ  ๋ฉ”๋ชจ๋ฆฌ๋ฅผ ํ™œ์šฉํ•ด ๊ฒน์น˜๋Š” ์›์†Œ์— ํšจ์œจ์ ์œผ๋กœ ์ ‘๊ทผํ•˜๋Š” 1D ํ•ฉ์„ฑ๊ณฑ ๊ตฌํ˜„์ž…๋‹ˆ๋‹ค. ๋‹จ๊ณ„๋ณ„๋กœ ์‚ดํŽด๋ณด๊ฒ ์Šต๋‹ˆ๋‹ค:

๋ฉ”๋ชจ๋ฆฌ ๋ ˆ์ด์•„์›ƒ

์ž…๋ ฅ ๋ฐฐ์—ด a:       [0  1  2  3  4  5]
์ปค๋„ b:          [0  1  2]

์—ฐ์‚ฐ ๊ณผ์ •

  1. ๋ฐ์ดํ„ฐ ๋กœ๋”ฉ:

    shared_a: [0  1  2  3  4  5]  // ์ž…๋ ฅ ๋ฐฐ์—ด
    shared_b: [0  1  2]           // ํ•ฉ์„ฑ๊ณฑ ์ปค๋„
    
  2. ๊ฐ ์œ„์น˜ i์— ๋Œ€ํ•œ ํ•ฉ์„ฑ๊ณฑ ์—ฐ์‚ฐ:

    output[0] = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] = 0*0 + 1*1 + 2*2 = 5
    output[1] = a[1]*b[0] + a[2]*b[1] + a[3]*b[2] = 1*0 + 2*1 + 3*2 = 8
    output[2] = a[2]*b[0] + a[3]*b[1] + a[4]*b[2] = 2*0 + 3*1 + 4*2 = 11
    output[3] = a[3]*b[0] + a[4]*b[1] + a[5]*b[2] = 3*0 + 4*1 + 5*2 = 14
    output[4] = a[4]*b[0] + a[5]*b[1] + 0*b[2]    = 4*0 + 5*1 + 0*2 = 5
    output[5] = a[5]*b[0] + 0*b[1]   + 0*b[2]     = 5*0 + 0*1 + 0*2 = 0
    

๊ตฌํ˜„ ์ƒ์„ธ

  1. ์Šค๋ ˆ๋“œ ์ฐธ์—ฌ ๋ฒ”์œ„์™€ ํšจ์œจ์„ฑ:

    • ์ ์ ˆํ•œ ์Šค๋ ˆ๋“œ ๊ฐ€๋“œ๊ฐ€ ์—†๋Š” ๋น„ํšจ์œจ์  ์ ‘๊ทผ:

      # ๋น„ํšจ์œจ์  ๋ฒ„์ „ - ๊ฒฐ๊ณผ๊ฐ€ ์‚ฌ์šฉ๋˜์ง€ ์•Š์„ ์Šค๋ ˆ๋“œ๋„ ๋ชจ๋‘ ์—ฐ์‚ฐ ์ˆ˜ํ–‰
      local_sum = Scalar[dtype](0)
      for j in range(CONV):
          if local_i + j < SIZE:
              local_sum += shared_a[local_i + j] * shared_b[j]
      # ๋งˆ์ง€๋ง‰ ์“ฐ๊ธฐ๋งŒ ๊ฐ€๋“œ
      if global_i < SIZE:
          output[global_i] = local_sum
      
    • ํšจ์œจ์ ์ด๊ณ  ์˜ฌ๋ฐ”๋ฅธ ๊ตฌํ˜„:

      if global_i < SIZE:
          var local_sum: output.element_type = 0  # var๋กœ ํƒ€์ž… ์ถ”๋ก  ํ™œ์šฉ
          @parameter  # CONV๊ฐ€ ์ƒ์ˆ˜์ด๋ฏ€๋กœ ์ปดํŒŒ์ผ ํƒ€์ž„์— ๋ฃจํ”„ ์ „๊ฐœ
          for j in range(CONV):
              if local_i + j < SIZE:
                  local_sum += shared_a[local_i + j] * shared_b[j]
          output[global_i] = local_sum
      

    ํ•ต์‹ฌ์ ์ธ ์ฐจ์ด๋Š” ๊ฐ€๋“œ์˜ ์œ„์น˜์ž…๋‹ˆ๋‹ค. ๋น„ํšจ์œจ์  ๋ฒ„์ „์€ global_i >= SIZE์ธ ์Šค๋ ˆ๋“œ๋ฅผ ํฌํ•จํ•ด ๋ชจ๋“  ์Šค๋ ˆ๋“œ๊ฐ€ ํ•ฉ์„ฑ๊ณฑ ์—ฐ์‚ฐ์„ ์ˆ˜ํ–‰ํ•œ ๋’ค, ๋งˆ์ง€๋ง‰ ์“ฐ๊ธฐ์—์„œ๋งŒ ๊ฐ€๋“œ๋ฅผ ์ ์šฉํ•ฉ๋‹ˆ๋‹ค. ์ด๋กœ ์ธํ•ด:

    • ๋ถˆํ•„์š”ํ•œ ์—ฐ์‚ฐ: ์œ ํšจ ๋ฒ”์œ„ ๋ฐ–์˜ ์Šค๋ ˆ๋“œ๊ฐ€ ์“ธ๋ชจ์—†๋Š” ์ž‘์—…์„ ์ˆ˜ํ–‰
    • ํšจ์œจ ์ €ํ•˜: ์‚ฌ์šฉ๋˜์ง€ ์•Š์„ ์—ฐ์‚ฐ์— ์ž์› ์†Œ๋น„
    • GPU ํ™œ์šฉ๋„ ์ €ํ•˜: ์˜๋ฏธ ์—†๋Š” ๊ณ„์‚ฐ์— GPU ์ฝ”์–ด๋ฅผ ๋‚ญ๋น„

    ํšจ์œจ์  ๋ฒ„์ „์€ ์œ ํšจํ•œ global_i ๊ฐ’์„ ๊ฐ€์ง„ ์Šค๋ ˆ๋“œ๋งŒ ์—ฐ์‚ฐ์„ ์ˆ˜ํ–‰ํ•˜๋ฏ€๋กœ GPU ์ž์›์„ ๋” ์ž˜ ํ™œ์šฉํ•ฉ๋‹ˆ๋‹ค.

  2. ์ฃผ์š” ๊ตฌํ˜„ ํŠน์ง•:

    • var์™€ output.element_type์œผ๋กœ ์ ์ ˆํ•œ ํƒ€์ž… ์ถ”๋ก 
    • @parameter ๋ฐ์ฝ”๋ ˆ์ดํ„ฐ๋กœ ํ•ฉ์„ฑ๊ณฑ ๋ฃจํ”„๋ฅผ ์ปดํŒŒ์ผ ํƒ€์ž„์— ์ „๊ฐœ
    • ์—„๊ฒฉํ•œ ๊ฒฝ๊ณ„ ๊ฒ€์‚ฌ๋กœ ๋ฉ”๋ชจ๋ฆฌ ์•ˆ์ „์„ฑ ํ™•๋ณด
    • LayoutTensor์˜ ํƒ€์ž… ์‹œ์Šคํ…œ์œผ๋กœ ์ฝ”๋“œ ์•ˆ์ „์„ฑ ํ–ฅ์ƒ
  3. ๋ฉ”๋ชจ๋ฆฌ ๊ด€๋ฆฌ:

    • ์ž…๋ ฅ ๋ฐฐ์—ด๊ณผ ์ปค๋„ ๋ชจ๋‘ ๊ณต์œ  ๋ฉ”๋ชจ๋ฆฌ ์‚ฌ์šฉ
    • ์Šค๋ ˆ๋“œ๋‹น ์ „์—ญ ๋ฉ”๋ชจ๋ฆฌ์—์„œ 1ํšŒ ๋กœ๋“œ
    • ๋กœ๋“œํ•œ ๋ฐ์ดํ„ฐ์˜ ํšจ์œจ์  ์žฌ์‚ฌ์šฉ
  4. ์Šค๋ ˆ๋“œ ์กฐ์œจ:

    • barrier()๋กœ ๋ชจ๋“  ๋ฐ์ดํ„ฐ ๋กœ๋“œ๊ฐ€ ๋๋‚œ ํ›„ ์—ฐ์‚ฐ ์‹œ์ž‘์„ ๋ณด์žฅ
    • ๊ฐ ์Šค๋ ˆ๋“œ๊ฐ€ ์ถœ๋ ฅ ์›์†Œ ํ•˜๋‚˜๋ฅผ ๊ณ„์‚ฐ
    • ๋ณ‘ํ•ฉ ๋ฉ”๋ชจ๋ฆฌ ์ ‘๊ทผ ํŒจํ„ด ์œ ์ง€
  5. ์„ฑ๋Šฅ ์ตœ์ ํ™”:

    • ์ „์—ญ ๋ฉ”๋ชจ๋ฆฌ ์ ‘๊ทผ ์ตœ์†Œํ™”
    • ๊ณต์œ  ๋ฉ”๋ชจ๋ฆฌ๋กœ ๋น ๋ฅธ ๋ฐ์ดํ„ฐ ์ ‘๊ทผ
    • ๋ฉ”์ธ ์—ฐ์‚ฐ ๋ฃจํ”„์—์„œ ์Šค๋ ˆ๋“œ ๋ถ„๊ธฐ ํšŒํ”ผ
    • @parameter ๋ฐ์ฝ”๋ ˆ์ดํ„ฐ๋ฅผ ํ†ตํ•œ ๋ฃจํ”„ ์ „๊ฐœ