이권일 Sse 를 이용한 최적화와 실제 사용 예

SSE 를 이용한 최적화와 실제 사용 예 이권일 EA Seoul Studio (BFO)

발표 대상 C/C++ 프로그래머 H/W 및 최적화에 관심 있는 자 GPGPU 를 준비하는 자

SSE (Streaming SIMD Extensions) 1999년 펜티엄3 에 처음 포함된 확장 기능 Floating Point 및 비교 로직 등 다양한 연산 SSE 전용 128bit XMM 레지스터 8개 추가 MMX 와 달리 거의 모든 기능이 구현됨

SIMD 연산 일반연산 1.0 2.0 3.0 4.0 1.0 5.0 6.0 7.0 8.0 5.0 6.0 8.0 10.0 12.0 6.0

__m128 자료형 typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 { float m128_f32[4]; unsigned __int64 m128_u64[2]; __int8 m128_i8[16]; __int16 m128_i16[8]; __int32 m128_i32[4]; __int64 m128_i64[2]; unsigned __int8 m128_u8[16]; unsigned __int16 m128_u16[8]; unsigned __int32 m128_u32[4]; } __m128;

SSE 2 부터 새로이 추가된 __int64 와 double 을 지원하기 위한 __m128i, __m128d 자료형도 있음

명령어에 따라 2,4,8,16 SIMD 연산이 수행될 수 있음. 구조체에는 어떤 데이터가 들어 있는지 알 수 없음

편하게 코딩하기 // 산술 연산자 __forceinline __m128 operator+(__m128 l, __m128 r) { return _mm_add_ps(l,r); } __forceinline __m128 operator-(__m128 l, __m128 r) { return _mm_sub_ps(l,r); } __forceinline __m128 operator*(__m128 l, __m128 r) { return _mm_mul_ps(l,r); } __forceinline __m128 operator/(__m128 l, __m128 r) { return _mm_div_ps(l,r); } __forceinline __m128 operator+(__m128 l, float r) { return _mm_add_ps(l, _mm_set1_ps(r)); } __forceinline __m128 operator-(__m128 l, float r) { return _mm_sub_ps(l, _mm_set1_ps(r)); } __forceinline __m128 operator*(__m128 l, float r) { return _mm_mul_ps(l, _mm_set1_ps(r)); } __forceinline __m128 operator/(__m128 l, float r) { return _mm_div_ps(l, _mm_set1_ps(r)); } // 논리 연산자 __forceinline __m128 operator&(__m128 l, __m128 r) { return _mm_and_ps(l,r); } __forceinline __m128 operator|(__m128 l, __m128 r) { return _mm_or_ps(l,r); } // 비교 연산자 __forceinline __m128 operator<(__m128 l, __m128 r) { return _mm_cmplt_ps(l,r); } __forceinline __m128 operator>(__m128 l, __m128 r) { return _mm_cmpgt_ps(l,r); } __forceinline __m128 operator<=(__m128 l, __m128 r) { return _mm_cmple_ps(l,r); } __forceinline __m128 operator>=(__m128 l, __m128 r) { return _mm_cmpge_ps(l,r); } __forceinline __m128 operator!=(__m128 l, __m128 r) { return _mm_cmpneq_ps(l,r); } __forceinline __m128 operator==(__m128 l, __m128 r) { return _mm_cmpeq_ps(l,r); }

SIMD 정말 4배 빠른가요? // C 버전 for(size_t i=0; i<count;++i) { b[i] = a[i] + a[i]; } -> 실행 시간 49.267 ms // Compiler Intrinsic 버전 for(size_t i=0; i<count/4;++i) { b4[i] = a4[i] + a4[i]; } -> 실행 시간 47.927 ms

메모리 병목!! a[0] b[0] + a[1] b[1] + a[2] b[2] + a[3] b[3] + a[4] + a[5] + a[0] b[0] a[1] b[1] a[2] b[2] a[3] b[3] a[4] a[5] a[6] a[7] + + + + + + +

연산량을 늘리자! sinf() // sin(a) = a – (a^3)/3! + (a^5)/5! – (a^7)/7! … float req_3f = 1.0f / (3.0*2.0*1.0); float req_5f = 1.0f / (5.0*4.0*3.0*2.0*1.0); float req_7f = 1.0f / (7.0*6.0*5.0*4.0*3.0*2.0*1.0); for(size_t i=0; i<count; ++i) { b[i] = a[i] - a[i]*a[i]*a[i]*req_3f + a[i]*a[i]*a[i]*a[i]*a[i]*req_5f - a[i]*a[i]*a[i]*a[i]*a[i]*a[i]*a[i]*req_7f; } -> 실행 시간 111. ms

C 언어의 연산 병목 a[0] b[0] + a[1] b[1] + a[2] b[2] + a[3] b[3] + a[4] + a[0] a[1] a[2] a[3] a[4] b[0] b[1] b[2] b[3] + + + + +

SSE 버전의 sinf() // sin(a) = a – (a^3)/3! + (a^5)/5! – (a^7)/7! … __m128 req_3f4 = _mm_set1_ps(req_3f); __m128 req_5f4 = _mm_set1_ps(req_5f); __m128 req_7f4 = _mm_set1_ps(req_7f); for(size_t i=0; i<count/4; ++i) { b4[i] = a4[i] - a4[i]*a4[i]*a4[i]*req_3f4 + a4[i]*a4[i]*a4[i]*a4[i]*a4[i]*req_5f4 - a4[i]*a4[i]*a4[i]*a4[i]*a4[i]*a4[i]*a4[i]*req_7f4; } -> 실행 시간 48.939 ms

SSE는 아직도 메모리 병목!! a[0,1,2,3] + b[0,1,2,3] a[4,5,6,7] + b[4,5,6,7] a[8,9,10,11] + b[8,9,10,11] a[12,13,14,15] + b[12,13,14,15] a[16,17,18,19] a[0,1,2,3] b[0,1,2,3] a[4,5,6,7] b[4,5,6,7] a[8,9,10,11] b[8,9,10,11] a[12,13,14,15] b[12,13,14,15] a[16,17,18,19] + + + +

a+a와 sin() 연산 시간이 같다?

SSE 에서 a4[i] + b4[i] 를 구성하는 데 6 명령어로 실행되었고 sin() 은 29 명령어로 실행

그러나 컴파일러는 Vectorization을 잘 못한다.

_mm_stream_ps() // C 버전 for(size_t i=0; i<count;++i) { b[i] = a[i] + a[i]; } -> 실행 시간 49.267 ms // a+a stream 버전 for(size_t i=0; i<count/4;++i) { _mm_stream_ps((float*)(b4+i), _mm_add_ps(a4[i], a4[i])); } -> 실행 시간 30.114 ms

CPU _mm_stream_ps() 의 작동 Execution Unit L1 Cache L2 Cache WC Buffer Memory BUS Memory

_mm_stream_ps() 는 빠르다 !! Move Aligned Four Packed Single-FP Non Temporal

쓰기 순서를 보장하지 않으므로 쓰고 바로 읽으면 안됨

Stream 을 추가한 그래프 !!

같은 시간에 더 많은 일을 합시다!! float Read + Write 시간 : 2.896 ns __m128 Read + Write 시간 : 11.214 ns __m128 Read + Stream 시간 : 6.977 ns

SSE 프로그래밍 메모리 접근 시간이 길어지고 연산시간이 짧아짐에 따라 더 많은 계산을 할 수 있다. 요즘 CPU는 Out-of-Order 로 인해 대부분 비동기 실행을 한다. 적극 이용하자. 병렬화와 병목 문제는 GPGPU 연산에도 동일하게 적용된다. 미래를 대비하자.!!

SSE 를 사용한 CPU Skinning Vertex : 1024 * 1024 Bone : 200 4 weight per vertex + normal + tangent SSE 컴파일 옵션이 켜진 C, SSE최적화 스키닝 없는 C 루프 복사, SSE 루프 복사, memcpy()

C Skinning Code // Optimized C Version D3DXMATRIX m = b[in->index[0]] * in->blend[0] + b[in->index[1]] * in->blend[1] + b[in->index[2]] * in->blend[2] + b[in->index[3]] * in->blend[3]; out->position.x = in->position.x*m._11 + in->position.y*m._21 + in->position.z*m._31 + m._41; out->position.y = in->position.x*m._12 + in->position.y*m._22 + in->position.z*m._32 + m._42; out->position.z = in->position.x*m._13 + in->position.y*m._23 + in->position.z*m._33 + m._43; out->normal.x = in->normal.x*m._11 + in->normal.y*m._21 + in->normal.z*m._31; out->normal.y = in->normal.x*m._12 + in->normal.y*m._22 + in->normal.z*m._32; out->normal.z = in->normal.x*m._13 + in->normal.y*m._23 + in->normal.z*m._33; out->tangent.x = in->tangent.x*m._11 + in->tangent.y*m._21 + in->tangent.z*m._31; out->tangent.y = in->tangent.x*m._12 + in->tangent.y*m._22 + in->tangent.z*m._32; out->tangent.z = in->tangent.x*m._13 + in->tangent.y*m._23 + in->tangent.z*m._33;

SSE Skinning Code // SSE Code __m128 b0 = _mm_set_ps1(in->blend[0]); __m128 b1 = _mm_set_ps1(in->blend[1]); __m128 b2 = _mm_set_ps1(in->blend[2]); __m128 b3 = _mm_set_ps1(in->blend[3]); __m128* m[4] = { (__m128*)( matrix+in->index[0] ), (__m128*)( matrix+in->index[1] ), (__m128*)( matrix+in->index[2] ), (__m128*)( matrix+in->index[3] ) }; __m128 m0 = m[0][0]*b0 + m[1][0]*b1 + m[2][0]*b2 + m[3][0]*b3; __m128 m1 = m[0][1]*b0 + m[1][1]*b1 + m[2][1]*b2 + m[3][1]*b3; __m128 m2 = m[0][2]*b0 + m[1][2]*b1 + m[2][2]*b2 + m[3][2]*b3; __m128 m3 = m[0][3]*b0 + m[1][3]*b1 + m[2][3]*b2 + m[3][3]*b3; _mm_stream_ps( out->position, m0*in->position.x+m1*in->position.y+m2*in->position.z+m3 ); _mm_stream_ps( out->normal, m0*in->normal.x+m1*in->normal.y+m2*in->normal.z ); _mm_stream_ps( out->tangent, m0*in->tangent.x+m1*in->tangent.y+m2*in->tangent.z );

SSE Skinning 결과 memcpy() 시간의 80% 로 스키닝을 할 수 있다. 파티클, UI 등에 유용하게 사용할 수 있다. Dynamic VB 를 쓰는 동안 계산을 추가로 할 수 있다.

SSE를 사용한 KdTree

Deep-Narrow Tree 를 만들어야 효율이 좋아지므로 노드가 무척 많아진다.

Tree Node 방문이 전체 처리 시간의 90% 를 차지한다.

Scaleform과 SSE Flash 파일을 3D 가속을 받으며 실행 가능하도록 만들어진 라이브러리 Direct3D/OpenGL 및 다양한 렌더링 라이브러리 지원 현재 프로젝트의 UI 제작에 사용 209개 파일 65147 Line 의 ActionScript 와 DXT5 79MB UI 이미지

Scaleform 3.1 의 문제점 복잡한 swf들을 다수 사용할 경우 CPU 사용률이 상당히 높다. 높은 자유도가 GPU에 최적화 되기 어려운 UI 를 만들게 한다. GRendererD3D9 은 예제 코드에 가깝고 개발 시 H/W 특성이 고려되지 않았다.

Scaleform 개선 방향 Client GFx Client GFx GFxQueue Direct3D Direct3D GFxMovieView::Advance() GFxMovieView::Advance() SceneMgr::DrawScene() GFxMovieView::DisplayMT() SceneMgr::DrawScene() GFxQueue::DrawPrim() GFxMovieView::Display() GFxQueue::Flush() ID3DDevice::DrawPrim() 5~15ms/frame ID3DDevice::DrawPrim()

GFxQueue의 Batch 합치기 기능 Batch 합치기를 하기 위해 Vertex 를 Queue 에 넣을 때 Transform (TnL) 을 미리 처리 Render State, Texture State 를 체크해서 중복된 렌더링 재설정을 방지 Scene 에서 벗어난 Shape 들을 안 그리는 기능 추가 CPU로 대체된 VertexShader는 삭제, Pixel Shader도 Batch 합치기를 위해 수정

Transform 코드 case VS_XY16iCF32: { XY16iCF32_VERTEX* input = (XY16iCF32_VERTEX*)src + start; for(UINT i=0; i<count; ++i){ //output->pos.x = g_x + (input->x * vertexShaderConstant[0].x + input->y * vertexShaderConstant[1].x + vertexShaderConstant[2].x) * g_width; //output->pos.y = g_y - (input->x * vertexShaderConstant[0].y + input->y * vertexShaderConstant[1].y + vertexShaderConstant[2].y) * g_height; //output->pos.z = 1; //output->pos.w = 1; //output->color = FlipColor(input->color); //output->factor = FlipColor(input->factor); //output->tc0.x = input->x * vertexShaderConstant[3].x + input->y * vertexShaderConstant[4].x + vertexShaderConstant[5].x; //output->tc0.y = input->x * vertexShaderConstant[3].y + input->y * vertexShaderConstant[4].y + vertexShaderConstant[5].y; //aabb.AddPoint(output->pos); __m128 pos = g_pos + ( input->x*vertexShaderConstant[0] + input->y*vertexShaderConstant[1] + vertexShaderConstant[2] ) * g_size; _mm_storeu_ps(output->pos, pos); __m128i colors = _mm_loadl_epi64((__m128i*)&input->color); __m128i unpack = _mm_unpacklo_epi8(colors, g_zero); __m128i shuffle = _mm_shufflelo_epi16(unpack, _MM_SHUFFLE(3,0,1,2)); shuffle = _mm_shufflehi_epi16(shuffle, _MM_SHUFFLE(3,0,1,2)); __m128i packed = _mm_packus_epi16(shuffle, g_zero); _mm_storel_epi64((__m128i*)&output->color, packed); __m128 tc = input->x*vertexShaderConstant[3] + input->y*vertexShaderConstant[4] + vertexShaderConstant[5]; _mm_storeu_ps(output->tc0, tc); aabb_min = _mm_min_ps(aabb_min, pos); aabb_max = _mm_max_ps(aabb_max, pos); ++output; ++input; } }

이권일 Sse 를 이용한 최적화와 실제 사용 예

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Viewers also liked

Viewers also liked (20)

Similar to 이권일 Sse 를 이용한 최적화와 실제 사용 예

Similar to 이권일 Sse 를 이용한 최적화와 실제 사용 예 (20)

Recently uploaded

Recently uploaded (7)

이권일 Sse 를 이용한 최적화와 실제 사용 예

Editor's Notes