luckfox-pico-sdk/sysdrv/source/mcu/rt-thread/components/benchmark/tinymembench/x86-sse2.S
luckfox-eng29 8f34c2760d project:build.sh: Added fastboot support; custom modifications to U-Boot and kernel implemented using patches.
project:cfg:BoardConfig_IPC: Added fastboot BoardConfig file and firmware post-scripts, distinguishing between
the BoardConfigs for Luckfox Pico Pro and Luckfox Pico Max. project:app: Added fastboot_client and rk_smart_door
for quick boot applications; updated rkipc app to adapt to the latest media library. media:samples: Added more
usage examples. media:rockit: Fixed bugs; removed support for retrieving data frames from VPSS. media:isp:
Updated rkaiq library and related tools to support connection to RKISP_Tuner. sysdrv:Makefile: Added support for
compiling drv_ko on Luckfox Pico Ultra W using Ubuntu; added support for custom root filesystem.
sysdrv:tools:board: Updated Buildroot optional mirror sources, updated some software versions, and stored device
tree files and configuration files that undergo multiple modifications for U-Boot and kernel separately.
sysdrv:source:mcu: Used RISC-V MCU SDK with RT-Thread system, mainly for initializing camera AE during quick
boot. sysdrv:source:uboot: Added support for fastboot; added high baud rate DDR bin for serial firmware upgrades.
sysdrv:source:kernel: Upgraded to version 5.10.160; increased NPU frequency for RV1106G3; added support for
fastboot.

Signed-off-by: luckfox-eng29 <eng29@luckfox.com>
2024-10-14 09:47:04 +08:00

252 lines
6.4 KiB
ArmAsm

/*
* Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
 * Tell the linker that this object does not need an executable stack.
 * An assembly object that lacks a .note.GNU-stack section is treated by
 * GNU ld as requiring one (and recent binutils warn about it). The note is
 * emitted outside the architecture guard so that it is present even when
 * the rest of the file is preprocessed away.
 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#if defined(__i386__) || defined(__amd64__)
/*
 * GNU as, Intel operand order (destination first), register names written
 * without the % prefix. Everything below is assembled into .text.
 */
.intel_syntax noprefix
.text
/* Byte offset ahead of SRC that the prefetchnta hints in the pf32 and pf64 copy loops target. */
#define PREFETCH_DISTANCE 256
/*
 * asm_function_helper function_name
 *
 * Opens an exported function: declares the symbol global, emits its label
 * and binds the assembler symbols DST, SRC and SIZE to the registers that
 * hold the first, second and third argument for the ABI being built:
 *
 *   amd64 with _WIN64 defined:  DST = rcx, SRC = rdx, SIZE = r8
 *   amd64 otherwise:            DST = rdi, SRC = rsi, SIZE = rdx
 *   i386:                       DST = eax, SRC = ecx, SIZE = edx
 *
 * On i386 the macro also emits the three loads that fetch the arguments
 * from [esp + 4], [esp + 8] and [esp + 12], so it has to be expanded at the
 * very start of the function, before anything changes esp.
 *
 * The matching .endfunc is written by the user of the macro.
 *
 * NOTE(review): SIZE is consumed at full register width by the function
 * bodies; confirm that the C prototype passes a register-wide count on
 * amd64.
 */
.macro asm_function_helper function_name
    .global \function_name
#ifdef __ELF__
    /* Type the symbol as a function so ELF tools do not see an untyped label. */
    .type \function_name, @function
#endif
    .func \function_name
\function_name:
#ifdef __amd64__
#ifdef _WIN64
    .set DST,  rcx
    .set SRC,  rdx
    .set SIZE, r8
#else
    .set DST,  rdi
    .set SRC,  rsi
    .set SIZE, rdx
#endif
#else
    mov     eax, [esp + 4]          /* first stack argument  -> DST  */
    mov     ecx, [esp + 8]          /* second stack argument -> SRC  */
    mov     edx, [esp + 12]         /* third stack argument  -> SIZE */
    .set DST,  eax
    .set SRC,  ecx
    .set SIZE, edx
#endif
.endm
/*
 * asm_function function_name
 *
 * Front end for asm_function_helper. Builds with _WIN32 defined and _WIN64
 * not defined get a leading underscore prepended to the symbol name
 * (presumably to match the C symbol decoration used there - confirm);
 * every other build uses the name unchanged.
 */
.macro asm_function function_name
#if !defined(_WIN32) || defined(_WIN64)
    asm_function_helper \function_name
#else
    asm_function_helper _\function_name
#endif
.endm
/*
 * push3 a, b, c
 *
 * Pushes the three operands in the order given: a first, c last.
 * Undo with pop3 using the same operand order.
 */
.macro push3 a, b, c
    .irp reg, \a, \b, \c
        push \reg
    .endr
.endm
/*
 * pop3 a, b, c
 *
 * Pops three values in reverse operand order (c first, a last), so that
 * "pop3 x, y, z" restores what "push3 x, y, z" saved.
 */
.macro pop3 a, b, c
    .irp reg, \c, \b, \a
        pop \reg
    .endr
.endm
/*****************************************************************************/
/*
 * aligned_block_copy_movsb(DST, SRC, SIZE)
 *
 * Copies SIZE bytes from SRC to DST with one "rep movsb".
 *
 * rep movsb takes its operands in fixed registers (destination in rdi/edi,
 * source in rsi/esi, count in rcx/ecx). The arguments are moved there
 * through the stack, which works for any binding of DST/SRC/SIZE, including
 * bindings that overlap the target registers. The three string registers
 * are saved on entry and restored before returning.
 *
 * NOTE(review): the direction flag is not touched here; the copy assumes it
 * is clear on entry - confirm for the calling environment.
 */
asm_function aligned_block_copy_movsb
#ifdef __amd64__
    push3   rdi, rsi, rcx           /* save the string-op registers */
    push3   DST, SRC, SIZE          /* stage the arguments on the stack */
    pop3    rdi, rsi, rcx           /* rdi = DST, rsi = SRC, rcx = SIZE */
    rep movsb
    pop3    rdi, rsi, rcx           /* restore the saved registers */
#else
    push3   edi, esi, ecx           /* save the string-op registers */
    push3   DST, SRC, SIZE          /* stage the arguments on the stack */
    pop3    edi, esi, ecx           /* edi = DST, esi = SRC, ecx = SIZE */
    rep movsb
    pop3    edi, esi, ecx           /* restore the saved registers */
#endif
    ret
.endfunc
/*
 * aligned_block_copy_movsd(DST, SRC, SIZE)
 *
 * Copies from SRC to DST with one "rep movsd", i.e. in 4-byte elements.
 * The element count is SIZE shifted right by two (arithmetic shift), so any
 * remainder of SIZE modulo 4 is not copied.
 *
 * As in the movsb variant, the arguments reach rdi/rsi/rcx (edi/esi/ecx on
 * i386) through the stack, and those three registers are saved on entry and
 * restored before returning.
 *
 * NOTE(review): the direction flag is not touched here; the copy assumes it
 * is clear on entry - confirm for the calling environment.
 */
asm_function aligned_block_copy_movsd
#ifdef __amd64__
    push3   rdi, rsi, rcx           /* save the string-op registers */
    push3   DST, SRC, SIZE          /* stage the arguments on the stack */
    pop3    rdi, rsi, rcx           /* rdi = DST, rsi = SRC, rcx = SIZE */
    sar     rcx, 2                  /* byte count -> 4-byte element count */
    rep movsd
    pop3    rdi, rsi, rcx           /* restore the saved registers */
#else
    push3   edi, esi, ecx           /* save the string-op registers */
    push3   DST, SRC, SIZE          /* stage the arguments on the stack */
    pop3    edi, esi, ecx           /* edi = DST, esi = SRC, ecx = SIZE */
    sar     ecx, 2                  /* byte count -> 4-byte element count */
    rep movsd
    pop3    edi, esi, ecx           /* restore the saved registers */
#endif
    ret
.endfunc
/*
 * aligned_block_copy_sse2(DST, SRC, SIZE)
 *
 * Copies from SRC to DST in 64-byte steps: four 16-byte movdqa loads into
 * xmm0-xmm3 followed by four movdqa stores.
 *
 * - movdqa requires 16-byte aligned addresses, so DST and SRC must both be
 *   16-byte aligned.
 * - SIZE is tested after the body (signed "greater than zero" on the result
 *   of SIZE - 64), so one 64-byte step always runs and SIZE is effectively
 *   rounded up to a multiple of 64.
 * - Modifies the registers bound to DST, SRC and SIZE, xmm0-xmm3 and the
 *   flags.
 */
asm_function aligned_block_copy_sse2
1:
    movdqa  xmm0, [SRC + 0]
    movdqa  xmm1, [SRC + 16]
    movdqa  xmm2, [SRC + 32]
    movdqa  xmm3, [SRC + 48]
    movdqa  [DST + 0], xmm0
    movdqa  [DST + 16], xmm1
    movdqa  [DST + 32], xmm2
    movdqa  [DST + 48], xmm3
    add     SRC, 64
    add     DST, 64
    sub     SIZE, 64                /* last flag-setting op before the branch */
    jg      1b                      /* repeat while SIZE is still positive */
    ret
.endfunc
/*
 * aligned_block_copy_nt_sse2(DST, SRC, SIZE)
 *
 * Same 64-byte-per-step copy as aligned_block_copy_sse2, but the four
 * stores use movntdq (store with non-temporal hint) instead of movdqa.
 *
 * - movdqa and movntdq both require 16-byte aligned addresses.
 * - SIZE is tested after the body, so one 64-byte step always runs and SIZE
 *   is effectively rounded up to a multiple of 64.
 * - No sfence is executed after the non-temporal stores; a caller that
 *   needs them ordered against later stores has to fence itself.
 * - Modifies the registers bound to DST, SRC and SIZE, xmm0-xmm3 and the
 *   flags.
 */
asm_function aligned_block_copy_nt_sse2
1:
    movdqa  xmm0, [SRC + 0]
    movdqa  xmm1, [SRC + 16]
    movdqa  xmm2, [SRC + 32]
    movdqa  xmm3, [SRC + 48]
    movntdq [DST + 0], xmm0
    movntdq [DST + 16], xmm1
    movntdq [DST + 32], xmm2
    movntdq [DST + 48], xmm3
    add     SRC, 64
    add     DST, 64
    sub     SIZE, 64                /* last flag-setting op before the branch */
    jg      1b                      /* repeat while SIZE is still positive */
    ret
.endfunc
/*
 * aligned_block_copy_pf32_sse2(DST, SRC, SIZE)
 *
 * 64-byte-per-step movdqa copy with two prefetchnta hints per step, aimed
 * at SRC + PREFETCH_DISTANCE and SRC + PREFETCH_DISTANCE + 32, i.e. one
 * hint for every 32 bytes of source.
 *
 * - DST and SRC must be 16-byte aligned (movdqa).
 * - SIZE is tested after the body, so one 64-byte step always runs and SIZE
 *   is effectively rounded up to a multiple of 64.
 * - Near the end of the buffer the hinted addresses lie past the copied
 *   range.
 * - Modifies the registers bound to DST, SRC and SIZE, xmm0-xmm3 and the
 *   flags.
 */
asm_function aligned_block_copy_pf32_sse2
1:
    prefetchnta [SRC + PREFETCH_DISTANCE]
    prefetchnta [SRC + PREFETCH_DISTANCE + 32]
    movdqa  xmm0, [SRC + 0]
    movdqa  xmm1, [SRC + 16]
    movdqa  xmm2, [SRC + 32]
    movdqa  xmm3, [SRC + 48]
    movdqa  [DST + 0], xmm0
    movdqa  [DST + 16], xmm1
    movdqa  [DST + 32], xmm2
    movdqa  [DST + 48], xmm3
    add     SRC, 64
    add     DST, 64
    sub     SIZE, 64                /* last flag-setting op before the branch */
    jg      1b                      /* repeat while SIZE is still positive */
    ret
.endfunc
/*
 * aligned_block_copy_nt_pf32_sse2(DST, SRC, SIZE)
 *
 * 64-byte-per-step copy that combines two prefetchnta hints per step (at
 * SRC + PREFETCH_DISTANCE and SRC + PREFETCH_DISTANCE + 32) with movdqa
 * loads and movntdq (non-temporal hint) stores.
 *
 * - DST and SRC must be 16-byte aligned (movdqa / movntdq).
 * - SIZE is tested after the body, so one 64-byte step always runs and SIZE
 *   is effectively rounded up to a multiple of 64.
 * - No sfence is executed after the non-temporal stores.
 * - Modifies the registers bound to DST, SRC and SIZE, xmm0-xmm3 and the
 *   flags.
 */
asm_function aligned_block_copy_nt_pf32_sse2
1:
    prefetchnta [SRC + PREFETCH_DISTANCE]
    prefetchnta [SRC + PREFETCH_DISTANCE + 32]
    movdqa  xmm0, [SRC + 0]
    movdqa  xmm1, [SRC + 16]
    movdqa  xmm2, [SRC + 32]
    movdqa  xmm3, [SRC + 48]
    movntdq [DST + 0], xmm0
    movntdq [DST + 16], xmm1
    movntdq [DST + 32], xmm2
    movntdq [DST + 48], xmm3
    add     SRC, 64
    add     DST, 64
    sub     SIZE, 64                /* last flag-setting op before the branch */
    jg      1b                      /* repeat while SIZE is still positive */
    ret
.endfunc
/*
 * aligned_block_copy_pf64_sse2(DST, SRC, SIZE)
 *
 * 64-byte-per-step movdqa copy with a single prefetchnta hint per step,
 * aimed at SRC + PREFETCH_DISTANCE, i.e. one hint for every 64 bytes of
 * source.
 *
 * - DST and SRC must be 16-byte aligned (movdqa).
 * - SIZE is tested after the body, so one 64-byte step always runs and SIZE
 *   is effectively rounded up to a multiple of 64.
 * - Modifies the registers bound to DST, SRC and SIZE, xmm0-xmm3 and the
 *   flags.
 */
asm_function aligned_block_copy_pf64_sse2
1:
    prefetchnta [SRC + PREFETCH_DISTANCE]
    movdqa  xmm0, [SRC + 0]
    movdqa  xmm1, [SRC + 16]
    movdqa  xmm2, [SRC + 32]
    movdqa  xmm3, [SRC + 48]
    movdqa  [DST + 0], xmm0
    movdqa  [DST + 16], xmm1
    movdqa  [DST + 32], xmm2
    movdqa  [DST + 48], xmm3
    add     SRC, 64
    add     DST, 64
    sub     SIZE, 64                /* last flag-setting op before the branch */
    jg      1b                      /* repeat while SIZE is still positive */
    ret
.endfunc
/*
 * aligned_block_copy_nt_pf64_sse2(DST, SRC, SIZE)
 *
 * 64-byte-per-step copy that combines one prefetchnta hint per step (at
 * SRC + PREFETCH_DISTANCE) with movdqa loads and movntdq (non-temporal
 * hint) stores.
 *
 * - DST and SRC must be 16-byte aligned (movdqa / movntdq).
 * - SIZE is tested after the body, so one 64-byte step always runs and SIZE
 *   is effectively rounded up to a multiple of 64.
 * - No sfence is executed after the non-temporal stores.
 * - Modifies the registers bound to DST, SRC and SIZE, xmm0-xmm3 and the
 *   flags.
 */
asm_function aligned_block_copy_nt_pf64_sse2
1:
    prefetchnta [SRC + PREFETCH_DISTANCE]
    movdqa  xmm0, [SRC + 0]
    movdqa  xmm1, [SRC + 16]
    movdqa  xmm2, [SRC + 32]
    movdqa  xmm3, [SRC + 48]
    movntdq [DST + 0], xmm0
    movntdq [DST + 16], xmm1
    movntdq [DST + 32], xmm2
    movntdq [DST + 48], xmm3
    add     SRC, 64
    add     DST, 64
    sub     SIZE, 64                /* last flag-setting op before the branch */
    jg      1b                      /* repeat while SIZE is still positive */
    ret
.endfunc
/*
 * aligned_block_fill_sse2(DST, SRC, SIZE)
 *
 * Fills DST with a 16-byte pattern. The pattern is loaded once from [SRC]
 * into xmm0 before the loop; each step then writes it four times with
 * movdqa, covering 64 bytes. SRC is only read for that single load.
 *
 * - DST and SRC must be 16-byte aligned (movdqa).
 * - SIZE is tested after the body, so one 64-byte step always runs and SIZE
 *   is effectively rounded up to a multiple of 64.
 * - Modifies the registers bound to DST and SIZE, xmm0 and the flags.
 */
asm_function aligned_block_fill_sse2
    movdqa  xmm0, [SRC + 0]         /* fetch the fill pattern once */
1:
    movdqa  [DST + 0], xmm0
    movdqa  [DST + 16], xmm0
    movdqa  [DST + 32], xmm0
    movdqa  [DST + 48], xmm0
    add     DST, 64
    sub     SIZE, 64                /* last flag-setting op before the branch */
    jg      1b                      /* repeat while SIZE is still positive */
    ret
.endfunc
/*
 * aligned_block_fill_nt_sse2(DST, SRC, SIZE)
 *
 * Fills DST with a 16-byte pattern using movntdq (non-temporal hint)
 * stores. The pattern is loaded once from [SRC] into xmm0 before the loop;
 * each step then writes it four times, covering 64 bytes.
 *
 * - DST and SRC must be 16-byte aligned (movntdq / movdqa).
 * - SIZE is tested after the body, so one 64-byte step always runs and SIZE
 *   is effectively rounded up to a multiple of 64.
 * - No sfence is executed after the non-temporal stores.
 * - Modifies the registers bound to DST and SIZE, xmm0 and the flags.
 */
asm_function aligned_block_fill_nt_sse2
    movdqa  xmm0, [SRC + 0]         /* fetch the fill pattern once */
1:
    movntdq [DST + 0], xmm0
    movntdq [DST + 16], xmm0
    movntdq [DST + 32], xmm0
    movntdq [DST + 48], xmm0
    add     DST, 64
    sub     SIZE, 64                /* last flag-setting op before the branch */
    jg      1b                      /* repeat while SIZE is still positive */
    ret
.endfunc
/*****************************************************************************/
#endif