Get the project compiling on Apple Silicon macOS natively (arm64) (#2827)

I havn't tested it yet, but I can almost guarantee that atleast `goalc`
will not work in the slightest!

But the project is atleast fully compiling. My hope is to start
translating some AVX to NEON next / get `goalc` working...eventually.
This commit is contained in:
Tyler Wilding 2023-07-16 09:13:48 -06:00 committed by GitHub
parent 54b2c5dcbd
commit e0bc7ce732
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
30 changed files with 9860 additions and 60 deletions

View file

@ -45,9 +45,18 @@ jobs:
secrets: inherit
# MacOS
build_macos_clang:
build_macos_intel:
name: "🍎 MacOS"
uses: ./.github/workflows/macos-build-clang.yaml
uses: ./.github/workflows/macos-build.yaml
with:
cmakePreset: "Release-macos-clang"
cachePrefix: ""
# Q4 2023 there will hopefully be native arm64 runners
# https://github.com/github/roadmap/issues/528
# build_macos_arm:
# name: "🍎 MacOS"
# uses: ./.github/workflows/macos-build-arm.yaml
# with:
# cmakePreset: "Release-macos-clang"
# cachePrefix: ""

66
.github/workflows/macos-build-arm.yaml vendored Normal file
View file

@ -0,0 +1,66 @@
name: MacOS Build
on:
workflow_call:
inputs:
cmakePreset:
required: true
type: string
cachePrefix:
required: true
type: string
jobs:
build:
name: ARM
runs-on: macos-latest
timeout-minutes: 120
env: # overrides: https://github.com/mbitsnbites/buildcache/blob/master/doc/configuration.md
BUILDCACHE_MAX_CACHE_SIZE: 1000000000 # 1gb
BUILDCACHE_COMPRESS_FORMAT: ZSTD
BUILDCACHE_COMPRESS_LEVEL: 19
BUILDCACHE_DIRECT_MODE: true
BUILDCACHE_LOG_FILE: ${{ github.workspace }}/buildcache.log
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up ARM64 environment
run: sudo softwareupdate --install-rosetta --agree-to-license
- name: Install Package Dependencies
run: arch -arm64 brew install cmake ninja
- name: Setup Buildcache
uses: mikehardy/buildcache-action@v2.1.0
with:
cache_key: macos-12-${{ inputs.cachePrefix }}-${{ inputs.cmakePreset }}
buildcache_tag: v0.28.1
- name: CMake Generation
env:
CC: clang
CXX: clang++
run: |
cmake -B build --preset=${{ inputs.cmakePreset }} \
-DCMAKE_C_COMPILER_LAUNCHER=${{ github.workspace }}/buildcache/bin/buildcache \
-DCMAKE_CXX_COMPILER_LAUNCHER=${{ github.workspace }}/buildcache/bin/buildcache
- name: Build Project
run: cmake --build build --parallel $((`sysctl -n hw.logicalcpu`))
- name: Run Tests
run: ./test.sh
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: opengoal-macos-${{ inputs.cachePrefix }}
if-no-files-found: error
path: |
./build/goalc/goalc
./build/decompiler/extractor
./build/game/gk
./build/lsp/lsp

View file

@ -1,4 +1,4 @@
name: MacOS Build Clang
name: MacOS Build
on:
workflow_call:
@ -12,7 +12,7 @@ on:
jobs:
build:
name: Clang
name: Intel
runs-on: macos-12
timeout-minutes: 120
@ -49,9 +49,6 @@ jobs:
run: cmake --build build --parallel $((`sysctl -n hw.logicalcpu`))
- name: Run Tests
continue-on-error: true # until macOS is stable
env:
GTEST_OUTPUT: "xml:opengoal-test-report.xml"
run: ./test.sh
- name: Upload artifact

View file

@ -26,8 +26,8 @@ jobs:
# macOS
build_macos_clang:
name: "🍎 macOS - Intel"
uses: ./.github/workflows/macos-build-clang.yaml
name: "🍎 MacOS"
uses: ./.github/workflows/macos-build.yaml
with:
cmakePreset: "Release-macos-clang-static"
cachePrefix: "static"

View file

@ -22,6 +22,13 @@ else()
set(BUILD_SHARED_LIBS ON)
endif()
# For clangd
if (EXISTS "${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json" )
configure_file(
"${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json"
"${PROJECT_SOURCE_DIR}/build/compile_commands.json")
endif()
# Setup compiler flags
# TODO - consider moving most of the configuration into presets
# - https://cmake.org/cmake/help/latest/manual/cmake-presets.7.html
@ -112,7 +119,6 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
-Wredundant-decls \
-Wshadow \
-Wsign-promo \
-O3 \
-fdiagnostics-color=always"
)

View file

@ -14,7 +14,8 @@
"ASAN_BUILD": "OFF",
"STATICALLY_LINK": "OFF",
"ZYDIS_BUILD_SHARED_LIB": "ON",
"CODE_COVERAGE": "OFF"
"CODE_COVERAGE": "OFF",
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON"
}
},
{

View file

@ -29,6 +29,8 @@
- [MacOS](#macos)
- [Intel Based](#intel-based)
- [Apple Silicon](#apple-silicon)
- [VSCode](#vscode)
- [Building and Debugging](#building-and-debugging)
- [Building and Running the Game](#building-and-running-the-game)
- [Extract Assets](#extract-assets)
- [Build the Game](#build-the-game)
@ -244,14 +246,14 @@ Then build the entire project as `Windows Release (clang)`. You can also press C
Ensure that you have Xcode command line tools installed (this installs things like Apple Clang). If you don't, you can run the following command:
```bash
xcode-select install
xcode-select --install
```
#### Intel Based
```bash
brew install go-task/tap/go-task
brew install cmake nasm ninja go-task
brew install cmake nasm ninja go-task clang-format
cmake -B build --preset=Release-macos-clang
cmake --build build --parallel $((`sysctl -n hw.logicalcpu`))
```
@ -260,6 +262,28 @@ cmake --build build --parallel $((`sysctl -n hw.logicalcpu`))
**Not Supported at This Time**
```bash
brew install go-task/tap/go-task
brew install cmake ninja go-task clang-format
cmake -B build --preset=Release-macos-clang
cmake --build build --parallel $((`sysctl -n hw.logicalcpu`))
```
You may have to add the MacOS SDK to your `LIBRARY_PATH`:
- `export LIBRARY_PATH="$LIBRARY_PATH:/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib"`
### VSCode
If you either don't want to or cannot use Visual Studio for working with the C++ project, VSCode is a good alternatively.
The `clangd` extension is [recommended](https://marketplace.visualstudio.com/items?itemName=llvm-vs-code-extensions.vscode-clangd) and requires `clangd` to be on your `$PATH`. If you can run `clangd` in a terminal successfully then you should be good to go.
Once you generate your CMake for the first time the clangd LSP should be able to index the project and give you intellisense.
#### Building and Debugging
TODO - Consider Contributing Documentation :)
### Building and Running the Game
Getting a running game involves 4 steps:

View file

@ -3,7 +3,11 @@
#include <algorithm>
#include <functional>
#ifndef __aarch64__
#include "xmmintrin.h"
#else
#include "third-party/sse2neon/sse2neon.h"
#endif
#include "common/util/Assert.h"

View file

@ -2,7 +2,7 @@
#include <cstring>
#ifdef __arm__
#ifdef __aarch64__
#include <arm_acle.h>
u32 crc32(const u8* data, size_t size) {
u32 result = 0xffffffff;
@ -22,7 +22,6 @@ u32 crc32(const u8* data, size_t size) {
}
#else
#include <immintrin.h>
u32 crc32(const u8* data, size_t size) {
u32 result = 0xffffffff;
while (size >= 4) {

View file

@ -1,4 +1,7 @@
#pragma once
// clang-format off
// TODO - clang formatting is off in this file due to
// differences in newer versions of clang-format which we have not updates to yet
#if defined(__GNUC__)
#pragma GCC diagnostic push
@ -168,3 +171,4 @@ class LinkedWord {
#elif defined(__clang__)
#pragma clang diagnostic pop
#endif
// clang-format on

View file

@ -1,10 +1,30 @@
# We define our own compilation flags here.
set(CMAKE_CXX_STANDARD 17)
enable_language(ASM_NASM)
set(CMAKE_ASM_NASM_SOURCE_FILE_EXTENSIONS ${CMAKE_ASM_NASM_SOURCE_FILE_EXTENSIONS} asm)
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> <INCLUDES> -f ${CMAKE_ASM_NASM_OBJECT_FORMAT} -o <OBJECT> <SOURCE>")
set_source_files_properties(kernel/asm_funcs.asm PROPERTIES COMPILE_FLAGS "-g")
# Set a more convenient ARM flag
if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
set(ARM64_ARCH TRUE)
message(STATUS "ARM64 architecture detected")
else()
set(ARM64_ARCH FALSE)
message(STATUS "Non-ARM64 architecture detected")
endif()
if(ARM64_ARCH)
# Add your ARM64-specific configuration or build options here
set(OG_ASM_FUNCS_FILE kernel/asm_funcs_arm64.s)
enable_language(ASM)
set(CMAKE_ASM_SOURCE_FILE_EXTENSIONS ${CMAKE_ASM_SOURCE_FILE_EXTENSIONS} s)
# set(CMAKE_ASM_COMPILE_OBJECT "${CMAKE_ASM_COMPILER} -o <OBJECT> <SOURCE>")
set_source_files_properties(${OG_ASM_FUNCS_FILE} PROPERTIES COMPILE_FLAGS "-g")
else()
set(OG_ASM_FUNCS_FILE kernel/asm_funcs_x86_64.asm)
enable_language(ASM_NASM)
set(CMAKE_ASM_NASM_SOURCE_FILE_EXTENSIONS ${CMAKE_ASM_NASM_SOURCE_FILE_EXTENSIONS} asm)
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> <INCLUDES> -f ${CMAKE_ASM_NASM_OBJECT_FORMAT} -o <OBJECT> <SOURCE>")
set_source_files_properties(${OG_ASM_FUNCS_FILE} PROPERTIES COMPILE_FLAGS "-g")
endif()
set(RUNTIME_SOURCE
external/discord_jak1.cpp
external/discord_jak2.cpp
@ -69,7 +89,7 @@ set(RUNTIME_SOURCE
graphics/texture/jak2_tpage_dir.cpp
graphics/texture/TextureConverter.cpp
graphics/texture/TexturePool.cpp
kernel/asm_funcs.asm
${OG_ASM_FUNCS_FILE}
kernel/common/fileio.cpp
kernel/common/kboot.cpp
kernel/common/kdgo.cpp

View file

@ -1,7 +1,11 @@
#pragma once
#include <cfloat>
#include "immintrin.h"
#ifdef __aarch64__
#include "third-party/sse2neon/sse2neon.h"
#else
#include <immintrin.h>
#endif
#include "common/common_types.h"
#include "common/math/Vector.h"

View file

@ -1,7 +1,5 @@
#include "DirectRenderer2.h"
#include <immintrin.h>
#include "common/log/log.h"
#include "third-party/imgui/imgui.h"

View file

@ -1,6 +1,8 @@
#include "SkyBlendCPU.h"
#ifndef __aarch64__
#include <immintrin.h>
#endif
#include "common/util/os.h"
@ -23,6 +25,7 @@ SkyBlendCPU::~SkyBlendCPU() {
}
void blend_sky_initial_fast(u8 intensity, u8* out, const u8* in, u32 size) {
#ifndef __arm64__
if (get_cpu_info().has_avx2) {
#ifdef __AVX2__
__m256i intensity_vec = _mm256_set1_epi16(intensity);
@ -49,9 +52,11 @@ void blend_sky_initial_fast(u8 intensity, u8* out, const u8* in, u32 size) {
_mm_storel_epi64((__m128i*)(out + (i * 8)), result);
}
}
#endif
}
void blend_sky_fast(u8 intensity, u8* out, const u8* in, u32 size) {
#ifndef __arm64__
if (get_cpu_info().has_avx2) {
#ifdef __AVX2__
__m256i intensity_vec = _mm256_set1_epi16(intensity);
@ -86,9 +91,7 @@ void blend_sky_fast(u8 intensity, u8* out, const u8* in, u32 size) {
_mm_storel_epi64((__m128i*)(out + (i * 8)), out_val);
}
}
/*
*/
#endif
}
SkyBlendStats SkyBlendCPU::do_sky_blends(DmaFollower& dma,

View file

@ -25,4 +25,4 @@ class SkyBlendCPU {
u32 tbp;
GpuTexture* tex;
} m_textures[2];
};
};

View file

@ -243,7 +243,15 @@ void Shrub::render_tree(int idx,
}
Timer interp_timer;
interp_time_of_day_fast(settings.itimes, tree.tod_cache, m_color_result.data());
#ifndef __aarch64__
if (m_use_fast_time_of_day) {
interp_time_of_day_fast(settings.itimes, tree.tod_cache, m_color_result.data());
} else {
interp_time_of_day_slow(settings.itimes, *tree.colors, m_color_result.data());
}
#else
interp_time_of_day_slow(settings.itimes, *tree.colors, m_color_result.data());
#endif
tree.perf.tod_time.add(interp_timer.getSeconds());
Timer setup_timer;

View file

@ -68,6 +68,7 @@ class Shrub : public BucketRenderer {
static constexpr int TIME_OF_DAY_COLOR_COUNT = 8192;
bool m_has_level = false;
bool m_use_fast_time_of_day = true;
struct Cache {
std::vector<std::pair<int, int>> draw_idx_temp;

View file

@ -207,11 +207,15 @@ void Tfrag3::render_tree(int geom,
if (m_color_result.size() < tree.colors->size()) {
m_color_result.resize(tree.colors->size());
}
#ifndef __aarch64__
if (m_use_fast_time_of_day) {
interp_time_of_day_fast(itimes, tree.tod_cache, m_color_result.data());
interp_time_of_day_fast(settings.itimes, tree.tod_cache, m_color_result.data());
} else {
interp_time_of_day_slow(itimes, *tree.colors, m_color_result.data());
interp_time_of_day_slow(settings.itimes, *tree.colors, m_color_result.data());
}
#else
interp_time_of_day_slow(settings.itimes, *tree.colors, m_color_result.data());
#endif
glActiveTexture(GL_TEXTURE10);
glBindTexture(GL_TEXTURE_1D, tree.time_of_day_texture);
glTexSubImage1D(GL_TEXTURE_1D, 0, 0, tree.colors->size(), GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV,

View file

@ -434,11 +434,15 @@ void Tie3::setup_tree(int idx,
m_color_result.resize(tree.colors->size());
}
#ifndef __aarch64__
if (m_use_fast_time_of_day) {
interp_time_of_day_fast(settings.itimes, tree.tod_cache, m_color_result.data());
} else {
interp_time_of_day_slow(settings.itimes, *tree.colors, m_color_result.data());
}
#else
interp_time_of_day_slow(settings.itimes, *tree.colors, m_color_result.data());
#endif
glActiveTexture(GL_TEXTURE10);
glBindTexture(GL_TEXTURE_1D, tree.time_of_day_texture);

View file

@ -2,7 +2,11 @@
#include "background_common.h"
#ifdef __aarch64__
#include "third-party/sse2neon/sse2neon.h"
#else
#include <immintrin.h>
#endif
#include "common/util/os.h"
@ -274,6 +278,7 @@ SwizzledTimeOfDay swizzle_time_of_day(const std::vector<tfrag3::TimeOfDayColor>&
return out;
}
#ifndef __aarch64__
void interp_time_of_day_fast(const math::Vector<s32, 4> itimes[4],
const SwizzledTimeOfDay& swizzled_colors,
math::Vector<u8, 4>* out) {
@ -429,6 +434,7 @@ void interp_time_of_day_fast(const math::Vector<s32, 4> itimes[4],
}
}
}
#endif
bool sphere_in_view_ref(const math::Vector4f& sphere, const math::Vector4f* planes) {
math::Vector4f acc =

View file

@ -60,9 +60,11 @@ struct SwizzledTimeOfDay {
SwizzledTimeOfDay swizzle_time_of_day(const std::vector<tfrag3::TimeOfDayColor>& in);
#ifndef __aarch64__
void interp_time_of_day_fast(const math::Vector<s32, 4> itimes[4],
const SwizzledTimeOfDay& swizzled_colors,
math::Vector<u8, 4>* out);
#endif
void cull_check_all_slow(const math::Vector4f* planes,
const std::vector<tfrag3::VisNode>& nodes,

View file

@ -1,6 +1,10 @@
#include "Merc2.h"
#ifdef __aarch64__
#include "third-party/sse2neon/sse2neon.h"
#else
#include <xmmintrin.h>
#endif
#include "common/global_profiler/GlobalProfiler.h"

View file

@ -0,0 +1,280 @@
;; GOAL Runtime assembly functions. These exist only in the arm64 version of GOAL.
;; - https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Pass-arguments-to-functions-correctly
;; - https://en.wikipedia.org/wiki/Calling_convention#ARM_(A64)
;; - https://student.cs.uwaterloo.ca/~cs452/docs/rpi4b/aapcs64.pdf
;; - s16s31 (d8d15, q4q7) must be preserved
;; - s0s15 (d0d7, q0q3) and d16d31 (q8q15) do not need to be preserved
;; - https://devblogs.microsoft.com/oldnewthing/20220728-00/?p=106912
;; - ;; - https://courses.cs.washington.edu/courses/cse469/19wi/arm64.pdf
.text
;; Call C++ code on arm64 systems, from GOAL.
;; Following the macOS documentation which mostly aligns with standard arm64
.global _arg_call_arm64
.align 4
_arg_call_arm64:
stp x29, x30, [sp, #-16]!
mov x29, sp
ldr x8, [sp], #16
; Putting an exclamation point after the close-bracket
; means that the calculated effective address is written back to the base register. (pre-indexing)
stp q15, q14, [sp, #-32]!
stp q13, q12, [sp, #-32]!
stp q11, q10, [sp, #-32]!
stp q9, q8, [sp, #-32]!
blr x8
ldp q9, q8, [sp], #32
ldp q10, q11, [sp], #32
ldp q12, q13, [sp], #32
ldp q14, q15, [sp], #32
ldp x29, x30, [sp], #16
ret
;; Call C++ code on arm64 systems, from GOAL.
;;
;; Put arguments on the stack and put a pointer to this array in the first arg.
;; this function pushes all 8 OpenGOAL registers into a stack array.
;; then it calls the function pointed to by x0 (RAX in x86) with a pointer to this array.
;; it returns the return value of the called function.
.global _stack_call_arm64
.align 4
_stack_call_arm64:
stp x29, x30, [sp, #-16]!
mov x29, sp
ldr x8, [sp], #16
stp q15, q14, [sp, #-32]!
stp q13, q12, [sp, #-32]!
stp q11, q10, [sp, #-32]!
stp q9, q8, [sp, #-32]!
; create stack array of arguments
; arg 7 (R11 in x86)
; arg 6 (R10 in x86)
; arg 5 (R8 in x86)
; arg 4 (R8 in x86)
; arg 3 (RCX in x86)
; arg 2 (RDX in x86)
; arg 1 (RSI in x86)
; arg 0 (RDI in x86)
stp x7, x6, [sp, #-16]!
stp x5, x4, [sp, #-16]!
stp x3, x2, [sp, #-16]!
stp x1, x0, [sp, #-16]!
; set first argument
mov x19, sp
; call function
blr x8
; restore arguments
ldp x1, x0, [sp], #16
ldp x3, x2, [sp], #16
ldp x5, x4, [sp], #16
ldp x7, x6, [sp], #16
ldp q9, q8, [sp], #32
ldp q10, q11, [sp], #32
ldp q12, q13, [sp], #32
ldp q14, q15, [sp], #32
ldp x29, x30, [sp], #16
; return!
ret
;; Call c++ code through mips2c.
;; GOAL will call a dynamically generated trampoline.
;; The trampoline will have pushed the exec function and stack offset onto the stack
.global _mips2c_call_arm64
.align 4
_mips2c_call_arm64:
stp x29, x30, [sp, #-16]!
mov x29, sp
;; TODO - this is really weird using half an XMM, this makes the arm assembly
;; more difficult - this probably isn't required for arm?
;; grab the address to call and put it in xmm0
;; TODO - this stack pointer manipulation might be a problem for ARM64 which requires 16byte alignment
;; sub sp, 8
ldr q0, [sp, #+16]
;; grab the stack offset
ldr x0, [sp, #+8]
;; first, save quadword registers
stp q15, q14, [sp, #-32]!
stp q13, q12, [sp, #-32]!
stp q11, q10, [sp, #-32]!
stp q9, q8, [sp, #-32]!
; NOTE - in x86 the 2 special registers are saved (R10 and R11)
; we don't need to do that in ARM64, there are plenty of registers to work with
;; oof
sub sp, sp, 1280
str x0, [sp, #+64] ; arg 0 (RDI in x86) and
str x1, [sp, #+80] ; arg 1 (RSI in x86)
str x2, [sp, #+96] ; arg 2 (RDX in x86) and arg 3 (RCX in x86)
str x3, [sp, #+112] ; arg 2 (RDX in x86) and arg 3 (RCX in x86)
str x4, [sp, #+128] ; arg 4 (R8 in x86) and arg 5 (R8 in x86)
str x5, [sp, #+144] ; arg 4 (R8 in x86) and arg 5 (R8 in x86)
str x6, [sp, #+160] ; arg 6 (R10 in x86) and arg 7 (R11 in x86)
str x7, [sp, #+176] ; arg 6 (R10 in x86) and arg 7 (R11 in x86)
str x20, [sp, #+352] ;; s6 (pp) (R13 in x86) and s7 (st) (R14 in x86)
str x21, [sp, #+368] ;; s6 (pp) (R13 in x86) and s7 (st) (R14 in x86)
mov x0, sp ; move the stack pointer to arg 0
sub x0, x0, x22 ; R15 is a "special" offset TODO - whats special about it?
str x0, [sp, #+464] ;; mip2c code's MIPS stack
mov x0, sp ;; move the stack pointer to the new position
sub sp, sp, x8 ;; allocate space on the stack for GOAL fake stack
stp x8, x8, [sp, #-16]! ;; and remember this so we can find our way back
;; TODO - this used to be a movq rax, xmm0
;; TODO - not sure why an `xmm` was used because that movq only uses the lower 64bits anyway
mov x0, v0.d[0] ; represents the lower 64 bits of q0
blr x8 ;; call!
;; unallocate
ldp x8, x8, [sp], #16
add sp, sp, x8
ldr x8, [sp, #+32]
add sp, sp, 1280 ; reset the stackpointer back
ldp q9, q8, [sp], #32
ldp q10, q11, [sp], #32
ldp q12, q13, [sp], #32
ldp q14, q15, [sp], #32
add sp, sp, 24 ;; 16 for the stuff pushed by trampoline
ldp x29, x30, [sp], #16
ret
;; The _call_goal_asm function is used to call a GOAL function from C.
;; It calls on the parent stack, which is a bad idea if your stack is not already a GOAL stack.
;; It supports up to 3 arguments and a return value.
;; This should be called with the arguments:
;; - first goal arg
;; - second goal arg
;; - third goal arg
;; - address of function to call
;; - address of the symbol table
;; - GOAL memory space offset
.global _call_goal_asm_arm64
.align 4
_call_goal_asm_arm64:
stp x29, x30, [sp, #-16]!
mov x29, sp
;; saved registers we need to modify for GOAL should be preserved
; ARM64 requires 16-byte stack pointer alignment
stp x20, x21, [sp, #-16]!
str x22, [sp, #-16]!
;; x0 - first arg
;; x1 - second arg
;; x2 - third arg
;; x3 - function pointer
;; x4 - st (goes in x20 and x21)
;; x5 - off (goes in x22)
;; set GOAL process
mov x20, x4
;; symbol table
mov x21, x4
;; offset
mov x22, x5
;; call GOAL by function pointer
blr x3
;; restore saved registers.
ldr x22, [sp], #16
ldp x20, x21, [sp], #16
ldp x29, x30, [sp], #16
ret
.global _call_goal8_asm_arm64
.align 4
_call_goal8_asm_arm64:
stp x29, x30, [sp, #-16]!
mov x29, sp
;; saved registers we need to modify for GOAL should be preserved
; ARM64 requires 16-byte stack pointer alignment
stp x20, x21, [sp, #-16]!
str x22, [sp, #-16]!
;; x0 - first arg (func)
;; x1 - second arg (arg array)
;; x2 - third arg (0)
;; x3 - pp (goes in r13)
;; x4 - st (goes in r14)
;; x5 - off (goes in r15)
;; set GOAL function pointer
mov x20, x3
;; st
mov x21, x4
;; offset
mov x22, x5
;; move function to temp
mov x8, x0
;; extract arguments
ldr x0, [x1] ;; 0
ldr x2, [x1, #+16] ;; 2
ldr x3, [x1, #+24] ;; 3
ldr x4, [x1, #+32] ;; 4
ldr x5, [x1, #+40] ;; 5
ldr x6, [x1, #+48] ;; 6
ldr x7, [x1, #+56] ;; 7
ldr x1, [x1, #+8] ;; 1 (do this last)
;; call GOAL by function pointer
blr x8
;; retore registers.
ldr x22, [sp], #16
ldp x20, x21, [sp], #16
ldp x29, x30, [sp], #16
ret
;; Call goal, but switch stacks.
.global _call_goal_on_stack_asm_arm64
.align 4
_call_goal_on_stack_asm_arm64:
stp x29, x30, [sp, #-16]!
mov x29, sp
;; x0 - stack pointer
;; x1 - unused
;; x2 - unused
;; x3 - function pointer
;; x4 - st (goes in x21 and x20)
;; x5 - offset (goes in x22)
;; saved registers we need to modify for GOAL should be preserved
; ARM64 requires 16-byte stack pointer alignment
stp x20, x21, [sp, #-16]!
;; also stash the current stack pointer on the stack
;; NOTE - you cannot directly store or load the `sp` register in arm64
mov x9, sp
stp x22, x9, [sp, #-16]!
;; switch to new stack
mov sp, x0
mov x20, x4 ;; set GOAL function pointer
mov x21, x4 ;; symbol table
mov x22, x5 ;; offset
;; call GOAL by function pointer
blr x3
;; restore registers
ldp x22, x9, [sp], #16
mov sp, x9
ldp x20, x21, [sp], #16
ldp x29, x30, [sp], #16
ret

View file

@ -1,7 +1,3 @@
;;;;;;;;;;;;;;;;;;;;
;; asm_funcs.nasm ;;
;;;;;;;;;;;;;;;;;;;;
;; GOAL Runtime assembly functions. These exist only in the x86 version of GOAL.
SECTION .text
@ -10,12 +6,11 @@ SECTION .text
global _arg_call_systemv
_arg_call_systemv:
pop rax
push r10
push r11
sub rsp, 8
push r10 ; arg 6 (OpenGOAL compiler expects this register to be saved but systemv doesn't save it)
push r11 ; arg 7 (OpenGOAL compiler expects this register to be saved but systemv doesn't save it)
; xmm stuff
sub rsp, 128
sub rsp, 136 ; 128 (size for xmms) + 8 (stack alignment)
movaps [rsp], xmm8
movaps [rsp + 16], xmm9
movaps [rsp + 32], xmm10
@ -35,25 +30,24 @@ _arg_call_systemv:
movaps xmm13, [rsp + 80]
movaps xmm14, [rsp + 96]
movaps xmm15, [rsp + 112]
add rsp, 128
add rsp, 136 ; 128 (size for xmms) + 8 (stack alignment)
add rsp, 8
pop r11
pop r10
ret
;; Call C++ code on unix systems, from GOAL. Pug arguments on the stack and put a pointer to this array in the first arg.
;; Call C++ code on unix systems, from GOAL.
;;
;; Put arguments on the stack and put a pointer to this array in the first arg.
;; this function pushes all 8 OpenGOAL registers into a stack array.
;; then it calls the function pointed to by rax with a pointer to this array.
;; it returns the return value of the called function.
global _stack_call_systemv
_stack_call_systemv:
pop rax
; align stack
sub rsp, 8
sub rsp, 128
sub rsp, 136 ; 128 (size for xmms) + 8 (stack alignment)
movaps [rsp], xmm8
movaps [rsp + 16], xmm9
movaps [rsp + 32], xmm10
@ -95,10 +89,8 @@ _stack_call_systemv:
movaps xmm13, [rsp + 80]
movaps xmm14, [rsp + 96]
movaps xmm15, [rsp + 112]
add rsp, 128
; restore stack
add rsp, 8
add rsp, 136 ; 128 (size for xmms) + 8 (stack alignment)
; return!
ret
@ -149,14 +141,14 @@ _mips2c_call_systemv:
sub rsp, rax ;; allocate space on the stack for GOAL fake stack
push rax ;; and remember this so we can find our way back
push rax
sub rsp, 8
movq rax, xmm0
call rax ;; call!
;; unallocate
pop rax
add rsp, 8
pop rax
add rsp, rax
@ -347,7 +339,7 @@ _call_goal_asm_systemv:
;; call GOAL by function pointer
call rcx
;; retore x86 registers.
;; restore x86 registers.
pop r15
pop r14
pop r13

View file

@ -264,11 +264,19 @@ u64 make_string_from_c(const char* c_str) {
}
extern "C" {
#ifndef __aarch64__
#ifdef __APPLE__
void _arg_call_systemv() asm("_arg_call_systemv");
#else
void _arg_call_systemv();
#endif
#else
#ifdef __APPLE__
void _arg_call_arm64() asm("_arg_call_arm64");
#else
void _arg_call_arm64();
#endif
#endif
}
/*!
@ -280,8 +288,13 @@ Ptr<Function> make_function_from_c_systemv(void* func, bool arg3_is_pp) {
*(s7 + FIX_SYM_FUNCTION_TYPE), 0x40, UNKNOWN_PP));
auto f = (uint64_t)func;
auto target_function = (u8*)&f;
#ifndef __aarch64__
auto trampoline_function_addr = _arg_call_systemv;
#else
auto trampoline_function_addr = _arg_call_arm64;
#endif
auto trampoline = (u8*)&trampoline_function_addr;
// TODO - x86 code still being emitted below
// movabs rax, target_function
int offset = 0;
@ -382,13 +395,25 @@ Ptr<Function> make_function_from_c_win32(void* func, bool arg3_is_pp) {
}
extern "C" {
#ifndef __aarch64__
#ifdef __APPLE__
void _arg_call_systemv() asm("_arg_call_systemv");
void _stack_call_systemv() asm("_stack_call_systemv");
void _stack_call_win32() asm("_stack_call_win32");
#else
void _arg_call_systemv();
void _stack_call_systemv();
void _stack_call_win32();
#endif
#else
#if defined(__APPLE__)
void _arg_call_arm64() asm("_arg_call_arm64");
void _stack_call_arm64() asm("_stack_call_arm64");
#else
void _arg_call_arm64();
void _stack_call_arm64();
#endif
#endif
}
Ptr<Function> make_stack_arg_function_from_c_systemv(void* func) {
@ -397,7 +422,11 @@ Ptr<Function> make_stack_arg_function_from_c_systemv(void* func) {
*(s7 + FIX_SYM_FUNCTION_TYPE), 0x40, UNKNOWN_PP));
auto f = (uint64_t)func;
auto target_function = (u8*)&f;
#ifndef __aarch64__
auto trampoline_function_addr = _stack_call_systemv;
#else
auto trampoline_function_addr = _stack_call_arm64;
#endif
auto trampoline = (u8*)&trampoline_function_addr;
// movabs rax, target_function
@ -427,6 +456,7 @@ Ptr<Function> make_stack_arg_function_from_c_systemv(void* func) {
return mem.cast<Function>();
}
#ifdef _WIN32
/*!
* Create a GOAL function from a C function. This calls a windows function, but doesn't scramble
* the argument order. It's supposed to be used with _format_win32 which assumes GOAL order.
@ -467,6 +497,7 @@ Ptr<Function> make_stack_arg_function_from_c_win32(void* func) {
return mem.cast<Function>();
}
#endif
/*!
* Create a GOAL function from a C function. This doesn't export it as a global function, it just

View file

@ -276,10 +276,24 @@ u64 make_debug_string_from_c(const char* c_str) {
}
extern "C" {
#ifndef __aarch64__
#ifdef __APPLE__
void _arg_call_systemv() asm("_arg_call_systemv");
void _stack_call_systemv() asm("_stack_call_systemv");
void _stack_call_win32() asm("_stack_call_win32");
#else
void _arg_call_systemv();
void _stack_call_systemv();
void _stack_call_win32();
#endif
#else
#if defined(__APPLE__)
void _arg_call_arm64() asm("_arg_call_arm64");
void _stack_call_arm64() asm("_stack_call_arm64");
#else
void _arg_call_arm64();
void _stack_call_arm64();
#endif
#endif
}
@ -292,8 +306,13 @@ Ptr<Function> make_function_from_c_systemv(void* func, bool arg3_is_pp) {
u32_in_fixed_sym(FIX_SYM_FUNCTION_TYPE), 0x40, UNKNOWN_PP));
auto f = (uint64_t)func;
auto target_function = (u8*)&f;
#ifndef __aarch64__
auto trampoline_function_addr = _arg_call_systemv;
#else
auto trampoline_function_addr = _arg_call_arm64;
#endif
auto trampoline = (u8*)&trampoline_function_addr;
// TODO - x86 code still being emitted below
// movabs rax, target_function
int offset = 0;
@ -393,23 +412,17 @@ Ptr<Function> make_function_from_c_win32(void* func, bool arg3_is_pp) {
return mem.cast<Function>();
}
extern "C" {
#ifdef __APPLE__
void _stack_call_systemv() asm("_stack_call_systemv");
void _stack_call_win32() asm("_stack_call_win32");
#else
void _stack_call_systemv();
void _stack_call_win32();
#endif
}
Ptr<Function> make_stack_arg_function_from_c_systemv(void* func) {
// allocate a function object on the global heap
auto mem = Ptr<u8>(alloc_heap_object(s7.offset + FIX_SYM_GLOBAL_HEAP,
u32_in_fixed_sym(FIX_SYM_FUNCTION_TYPE), 0x40, UNKNOWN_PP));
auto f = (uint64_t)func;
auto target_function = (u8*)&f;
#ifndef __aarch64__
auto trampoline_function_addr = _stack_call_systemv;
#else
auto trampoline_function_addr = _stack_call_arm64;
#endif
auto trampoline = (u8*)&trampoline_function_addr;
// movabs rax, target_function
@ -439,6 +452,7 @@ Ptr<Function> make_stack_arg_function_from_c_systemv(void* func) {
return mem.cast<Function>();
}
#ifdef _WIN32
/*!
* Create a GOAL function from a C function. This calls a windows function, but doesn't scramble
* the argument order. It's supposed to be used with _format_win32 which assumes GOAL order.
@ -479,6 +493,7 @@ Ptr<Function> make_stack_arg_function_from_c_win32(void* func) {
return mem.cast<Function>();
}
#endif
/*!
* Create a GOAL function from a C function. This doesn't export it as a global function, it just

View file

@ -60,6 +60,84 @@ enum X86_REG : s8 {
XMM15, // saved
};
// TODO - i think it'll be better to make some sort of abstraction
// mapping between x86 and arm, but just using this enum as a place to prototype
// the registers to use.
enum ARM64_REG : s8 {
X0, // arg 0, caller-saved RDI
X1, // arg 1, caller-saved RSI
X2, // arg 2, caller-saved RDX
X3, // arg 3, caller-saved RCX
X4, // arg 4, caller-saved R8
X5, // arg 5, caller-saved R9
X6, // arg 6, caller-saved R10
X7, // arg 7, caller-saved R11
X8, // return, temp, not saved (RAX)
X9, // temp, not-saved
X10, // temp, not-saved
X11, // temp, not-saved
X12, // temp, not-saved
X13, // temp, not-saved
X14, // temp, not-saved
X15, // temp, not-saved
X16, // temp, not-saved
X17, // temp, not-saved
X18, // temp, not-saved
x19, // saved TODO purpose?, R12
x20, // pp, R13
x21, // st, R14
x22, // offset, TODO purpose?, R15
X23, // unused, callee saved
X24, // unused, callee saved
X25, // unused, callee saved
X26, // unused, callee saved
X27, // unused, callee saved
X28, // unused, callee saved
X29, // callee saved, FP - don't use it
X30, // LR - don't use it
SP, // stack pointer
// quadword registers, equivalent to XMMs
// the convention in arm64 is the callee preserves all Q values
// at the same time though, the caller should not depend on this convention!
Q0,
Q1,
Q2,
Q3,
Q4,
Q5,
Q6,
Q7,
Q8,
Q9,
Q10,
Q11,
Q12,
Q13,
Q14,
Q15,
Q16,
Q17,
Q18,
Q19,
Q20,
Q21,
Q22,
Q23,
Q24,
Q25,
Q26,
Q27,
Q28,
Q29,
Q30,
Q31
};
class Register {
public:
Register() = default;

21
third-party/sse2neon/LICENSE generated vendored Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2015-2022 SSE2NEON Contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

9216
third-party/sse2neon/sse2neon.h generated vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -39,3 +39,6 @@ third-party/libtinyfiledialogs:
git: https://github.com/native-toolkit/libtinyfiledialogs/commit/cc6b593c029110af8045826ce691f540c85e850c
alternatives:
- https://github.com/btzy/nativefiledialog-extended (only file dialog support though!)
third-party/sse2neon:
git: https://github.com/DLTcollab/sse2neon/commit/2eede22be8c5922e44616260c5eab728e3c5e26f
license: MIT