Skip to content

Commit 0c75f27

Browse files
Moved to NEON optimized memcpy usage.
1 parent 9895189 commit 0c75f27

File tree

12 files changed

+227
-82
lines changed

12 files changed

+227
-82
lines changed

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@ SOURCES += source/hacks
77
endif
88

99
CFILES := $(foreach dir,$(SOURCES), $(wildcard $(dir)/*.c))
10+
ASMFILES := $(foreach dir,$(SOURCES), $(wildcard $(dir)/*.S))
1011
CGFILES := $(foreach dir,$(SHADERS), $(wildcard $(dir)/*.cg))
1112
HEADERS := $(CGFILES:.cg=.h)
12-
OBJS := $(CFILES:.c=.o)
13+
OBJS := $(CFILES:.c=.o) $(ASMFILES:.S=.o)
1314

1415
PREFIX = arm-vita-eabi
1516
CC = $(PREFIX)-gcc

source/custom_shaders.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ void glShaderBinary(GLsizei count, const GLuint *handles, GLenum binaryFormat, c
159159

160160
// Allocating compiled shader on RAM and registering it into sceGxmShaderPatcher
161161
s->prog = (SceGxmProgram *)malloc(length);
162-
memcpy((void *)s->prog, binary, length);
162+
memcpy_neon((void *)s->prog, binary, length);
163163
sceGxmShaderPatcherRegisterProgram(gxm_shader_patcher, s->prog, &s->id);
164164
s->prog = sceGxmShaderPatcherGetProgramFromId(s->id);
165165
}
@@ -449,13 +449,13 @@ void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean nor
449449

450450
// Copying passed data to vitaGL mempool
451451
if (stride == 0)
452-
memcpy(ptr, pointer, count * bpe * size); // Faster if stride == 0
452+
memcpy_neon(ptr, pointer, count * bpe * size); // Faster if stride == 0
453453
else {
454454
int i;
455455
uint8_t *dst = (uint8_t *)ptr;
456456
uint8_t *src = (uint8_t *)pointer;
457457
for (i = 0; i < count; i++) {
458-
memcpy(dst, src, bpe * size);
458+
memcpy_neon(dst, src, bpe * size);
459459
dst += (bpe * size);
460460
src += stride;
461461
}

source/hacks/memcpy_neon.S

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/*
2+
* NEON code contributed by Siarhei Siamashka <[email protected]>.
3+
* Origin: http://sourceware.org/ml/libc-ports/2009-07/msg00003.html
4+
*
5+
* The GNU C Library is free software; you can redistribute it and/or
6+
* modify it under the terms of the GNU Lesser General Public License.
7+
*
8+
* Tweaked for Android by Jim Huang <[email protected]>
9+
*/
10+
11+
.arm
12+
.fpu neon
13+
14+
@ void *memcpy_neon(void *destination, const void *source, size_t num)
15+
.global memcpy_neon
16+
.type memcpy_neon, %function
17+
/*
18+
* ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use
19+
* of unaligned load/store memory accesses supported since ARMv6. This
20+
* will further improve performance, but can purely theoretically cause
21+
* problems if somebody decides to set SCTLR.A bit in the OS kernel
22+
* (to trap each unaligned memory access) or somehow mess with strongly
23+
* ordered/device memory.
24+
*/
25+
#define ENABLE_UNALIGNED_MEM_ACCESSES 1
26+
27+
#define NEON_MAX_PREFETCH_DISTANCE 320
28+
29+
.align 4
30+
@ memcpy_neon: copy num bytes from source to destination, using NEON
@ block transfers for the bulk and ARM loads/stores for the tail.
@
@ In:    r0 = destination, r1 = source, r2 = num (byte count)
@        (assumes the standard ARM procedure call convention for a
@        three-argument C function - TODO confirm against the toolchain)
@ Out:   r0 = destination (r0 is never written; ip is the moving pointer)
@ Clobb: r1, r2, r3, ip, d0-d3, condition flags
@
@ NOTE(review): the size checks use signed conditions (blt/bge), so a
@ count of 0x80000000 or more is seen as negative and drops straight to
@ the short tail copy - confirm callers never pass such sizes.
memcpy_neon:
31+
.fnstart
32+
@ ip = working destination pointer, so r0 survives as the return value
mov ip, r0
33+
cmp r2, #16
34+
blt 4f @ Have less than 16 bytes to copy
35+
36+
@ First ensure 16 byte alignment for the destination buffer
37+
tst r0, #0xF
38+
@ Destination already 16 byte aligned: go straight to the bulk copy
beq 2f
39+
@ Bit 0 of the destination set: copy one byte
tst r0, #1
40+
ldrneb r3, [r1], #1
41+
strneb r3, [ip], #1
42+
subne r2, r2, #1
43+
@ Bit 1 of the (updated) destination set: copy two bytes. The flags
@ from this tst stay live through the loads/stores below and are
@ consumed by the subne that follows the #endif.
tst ip, #2
44+
#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
45+
ldrneh r3, [r1], #2
46+
strneh r3, [ip], #2
47+
#else
48+
ldrneb r3, [r1], #1
49+
strneb r3, [ip], #1
50+
ldrneb r3, [r1], #1
51+
strneb r3, [ip], #1
52+
#endif
53+
subne r2, r2, #2
54+
55+
@ Bit 2 of the destination set: copy four bytes through NEON lanes
@ (one byte in lane 0 of each of d0-d3); the store asserts 32 bit
@ alignment of ip
tst ip, #4
56+
beq 1f
57+
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
58+
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!
59+
sub r2, r2, #4
60+
1:
61+
@ Bit 3 of the destination set: copy eight bytes; afterwards ip is
@ 16 byte aligned
tst ip, #8
62+
beq 2f
63+
vld1.8 {d0}, [r1]!
64+
vst1.8 {d0}, [ip, :64]!
65+
sub r2, r2, #8
66+
2:
67+
@ From here on r2 holds (bytes remaining - 32); a negative value
@ means fewer than 32 bytes are left
subs r2, r2, #32
68+
blt 3f
69+
@ r3 = current prefetch distance in bytes, starting at 32
mov r3, #32
70+
71+
@ Main copy loop, 32 bytes are processed per iteration.
72+
@ ARM instructions are used for doing fine-grained prefetch,
73+
@ increasing prefetch distance progressively up to
74+
@ NEON_MAX_PREFETCH_DISTANCE at runtime
75+
1:
76+
vld1.8 {d0-d3}, [r1]!
77+
@ Flags from this cmp are consumed by the addle two instructions
@ later (pld does not touch them)
cmp r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)
78+
pld [r1, r3]
79+
addle r3, r3, #32
80+
vst1.8 {d0-d3}, [ip, :128]!
81+
sub r2, r2, #32
82+
@ Stay in the prefetching loop while the biased remaining count is
@ at least the prefetch distance
cmp r2, r3
83+
bge 1b
84+
@ Negative biased count: fewer than 32 bytes left, skip the
@ non-prefetching loop
cmp r2, #0
85+
blt 3f
86+
1: @ Copy the remaining part of the buffer (already prefetched)
87+
vld1.8 {d0-d3}, [r1]!
88+
@ Flags from subs drive the bge below (the vst1 in between leaves
@ them unchanged)
subs r2, r2, #32
89+
vst1.8 {d0-d3}, [ip, :128]!
90+
bge 1b
91+
3: @ Copy up to 31 remaining bytes
92+
@ r2 is still biased by -32 here, but subtracting 32 leaves the low
@ five bits intact, so bits 4..0 of r2 equal the bytes still to copy
tst r2, #16
93+
beq 4f
94+
vld1.8 {d0, d1}, [r1]!
95+
vst1.8 {d0, d1}, [ip, :128]!
96+
4:
97+
@ Use ARM instructions exclusively for the final trailing part
98+
@ not fully fitting into full 16 byte aligned block in order
99+
@ to avoid "ARM store after NEON store" hazard. Also NEON
100+
@ pipeline will be (mostly) flushed by the time when the
101+
@ control returns to the caller, making the use of NEON mostly
102+
@ transparent (and avoiding hazards in the caller code)
103+
104+
#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
105+
@ lsl #29 moves bit 3 of r2 into the carry flag and bit 2 into the
@ sign flag: cs copies 8 bytes (two words), mi copies 4 bytes. The
@ loads overwrite r3, but the flags stay valid.
movs r3, r2, lsl #29
106+
ldrcs r3, [r1], #4
107+
strcs r3, [ip], #4
108+
ldrcs r3, [r1], #4
109+
strcs r3, [ip], #4
110+
ldrmi r3, [r1], #4
111+
strmi r3, [ip], #4
112+
@ lsl #31 moves bit 1 of r2 into the carry flag and bit 0 into the
@ sign flag: cs copies a halfword, mi copies the last byte
movs r2, r2, lsl #31
113+
ldrcsh r3, [r1], #2
114+
strcsh r3, [ip], #2
115+
ldrmib r3, [r1], #1
116+
strmib r3, [ip], #1
117+
#else
118+
@ Same bit tests as the unaligned-capable variant, but every
@ transfer is done one byte at a time
movs r3, r2, lsl #29
119+
@ Carry clear (bit 3 of r2 clear): skip the 8 byte group
bcc 1f
120+
.rept 8
121+
ldrcsb r3, [r1], #1
122+
strcsb r3, [ip], #1
123+
.endr
124+
1:
125+
@ Sign flag clear (bit 2 of r2 clear): skip the 4 byte group
bpl 1f
126+
.rept 4
127+
ldrmib r3, [r1], #1
128+
strmib r3, [ip], #1
129+
.endr
130+
1:
131+
@ Carry = bit 1 of r2 (two bytes), sign = bit 0 of r2 (one byte)
movs r2, r2, lsl #31
132+
ldrcsb r3, [r1], #1
133+
strcsb r3, [ip], #1
134+
ldrcsb r3, [r1], #1
135+
strcsb r3, [ip], #1
136+
ldrmib r3, [r1], #1
137+
strmib r3, [ip], #1
138+
#endif
139+
@ Return to the caller; r0 still holds the original destination
bx lr
140+
.fnend

source/legacy.c

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ void glVertex3f(GLfloat x, GLfloat y, GLfloat z) {
107107
last_vert->v.x = x;
108108
last_vert->v.y = y;
109109
last_vert->v.z = z;
110-
memcpy(&last_clr->v, &current_color.r, sizeof(vector4f));
110+
memcpy_neon(&last_clr->v, &current_color.r, sizeof(vector4f));
111111
last_clr->next = last_vert->next = NULL;
112112

113113
// Increasing vertex counter
@@ -135,8 +135,8 @@ void glVertex3fv(const GLfloat *v) {
135135
}
136136

137137
// Properly populating the new element
138-
memcpy(&last_vert->v, v, sizeof(vector3f));
139-
memcpy(&last_clr->v, &current_color.r, sizeof(vector4f));
138+
memcpy_neon(&last_vert->v, v, sizeof(vector3f));
139+
memcpy_neon(&last_clr->v, &current_color.r, sizeof(vector4f));
140140
last_clr->next = last_vert->next = NULL;
141141

142142
// Increasing vertex counter
@@ -157,7 +157,7 @@ void glColor3f(GLfloat red, GLfloat green, GLfloat blue) {
157157

158158
void glColor3fv(const GLfloat *v) {
159159
// Setting current color value
160-
memcpy(&current_color.r, v, sizeof(vector3f));
160+
memcpy_neon(&current_color.r, v, sizeof(vector3f));
161161
current_color.a = 1.0f;
162162
}
163163

@@ -187,7 +187,7 @@ void glColor4f(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha) {
187187

188188
void glColor4fv(const GLfloat *v) {
189189
// Setting current color value
190-
memcpy(&current_color.r, v, sizeof(vector4f));
190+
memcpy_neon(&current_color.r, v, sizeof(vector4f));
191191
}
192192

193193
void glColor4ub(GLubyte red, GLubyte green, GLubyte blue, GLubyte alpha) {
@@ -309,7 +309,7 @@ void glArrayElement(GLint i) {
309309
last_clr->next = NULL;
310310

311311
// Populating new vertex element
312-
memcpy(&last_vert->v, ptr, tex_unit->vertex_array.size * tex_unit->vertex_array.num);
312+
memcpy_neon(&last_vert->v, ptr, tex_unit->vertex_array.size * tex_unit->vertex_array.num);
313313

314314
// Checking if current texture unit has GL_COLOR_ARRAY enabled
315315
if (tex_unit->color_array_state) {
@@ -322,11 +322,11 @@ void glArrayElement(GLint i) {
322322

323323
// Populating new color element
324324
last_clr->v.a = 1.0f;
325-
memcpy(&last_clr->v, ptr_clr, tex_unit->color_array.size * tex_unit->color_array.num);
325+
memcpy_neon(&last_clr->v, ptr_clr, tex_unit->color_array.size * tex_unit->color_array.num);
326326

327327
} else {
328328
// Populating new color element with current color
329-
memcpy(&last_clr->v, &current_color.r, sizeof(vector4f));
329+
memcpy_neon(&last_clr->v, &current_color.r, sizeof(vector4f));
330330
}
331331

332332
// Checking if current texture unit has GL_TEXTURE_COORD_ARRAY enabled
@@ -347,7 +347,7 @@ void glArrayElement(GLint i) {
347347
}
348348

349349
// Populating new texcoord element
350-
memcpy(&last_uv->v, ptr_tex, tex_unit->vertex_array.size * 2);
350+
memcpy_neon(&last_uv->v, ptr_tex, tex_unit->vertex_array.size * 2);
351351
last_uv->next = NULL;
352352
}
353353
}
@@ -498,8 +498,8 @@ void glEnd(void) {
498498
memset(vertices, 0, (vertex_count * sizeof(vector3f)));
499499
indices = (uint16_t *)gpu_pool_memalign(idx_count * sizeof(uint16_t), sizeof(uint16_t));
500500
for (i = 0; i < vertex_count; i++) {
501-
memcpy(&vertices[n], &object->v, sizeof(vector3f));
502-
memcpy(&uv_map[n], &object_uv->v, sizeof(vector2f));
501+
memcpy_neon(&vertices[n], &object->v, sizeof(vector3f));
502+
memcpy_neon(&uv_map[n], &object_uv->v, sizeof(vector2f));
503503
indices[n] = n;
504504
object = object->next;
505505
object_uv = object_uv->next;
@@ -522,8 +522,8 @@ void glEnd(void) {
522522
indices[i * 6 + 5] = i * 4 + 3;
523523
}
524524
for (j = 0; j < vertex_count; j++) {
525-
memcpy(&vertices[j], &object->v, sizeof(vector3f));
526-
memcpy(&uv_map[j], &object_uv->v, sizeof(vector2f));
525+
memcpy_neon(&vertices[j], &object->v, sizeof(vector3f));
526+
memcpy_neon(&uv_map[j], &object_uv->v, sizeof(vector2f));
527527
object = object->next;
528528
object_uv = object_uv->next;
529529
}
@@ -554,8 +554,8 @@ void glEnd(void) {
554554
memset(vertices, 0, (vertex_count * sizeof(vector3f)));
555555
indices = (uint16_t *)gpu_pool_memalign(idx_count * sizeof(uint16_t), sizeof(uint16_t));
556556
for (i = 0; i < vertex_count; i++) {
557-
memcpy(&vertices[n], &object->v, sizeof(vector3f));
558-
memcpy(&colors[n], &object_clr->v, sizeof(vector4f));
557+
memcpy_neon(&vertices[n], &object->v, sizeof(vector3f));
558+
memcpy_neon(&colors[n], &object_clr->v, sizeof(vector4f));
559559
indices[n] = n;
560560
object = object->next;
561561
object_clr = object_clr->next;
@@ -579,8 +579,8 @@ void glEnd(void) {
579579
indices[i * 6 + 5] = i * 4 + 3;
580580
}
581581
for (j = 0; j < vertex_count; j++) {
582-
memcpy(&vertices[j], &object->v, sizeof(vector3f));
583-
memcpy(&colors[j], &object_clr->v, sizeof(vector4f));
582+
memcpy_neon(&vertices[j], &object->v, sizeof(vector3f));
583+
memcpy_neon(&colors[j], &object_clr->v, sizeof(vector4f));
584584
object = object->next;
585585
object_clr = object_clr->next;
586586
}

source/misc.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ void glFogfv(GLenum pname, const GLfloat *params) {
504504
fog_far = params[0];
505505
break;
506506
case GL_FOG_COLOR:
507-
memcpy(&fog_color.r, params, sizeof(vector4f));
507+
memcpy_neon(&fog_color.r, params, sizeof(vector4f));
508508
break;
509509
default:
510510
vgl_error = GL_INVALID_ENUM;
@@ -545,7 +545,7 @@ void glClipPlane(GLenum plane, const GLdouble *equation) {
545545
matrix4x4_transpose(inverted_transposed, inverted);
546546
vector4f temp;
547547
vector4f_matrix4x4_mult(&temp, inverted_transposed, &clip_plane0_eq);
548-
memcpy(&clip_plane0_eq.x, &temp.x, sizeof(vector4f));
548+
memcpy_neon(&clip_plane0_eq.x, &temp.x, sizeof(vector4f));
549549
break;
550550
default:
551551
vgl_error = GL_INVALID_ENUM;

0 commit comments

Comments
 (0)