diff --git a/SCsub b/SCsub
index f550872..2b2dfff 100644
--- a/SCsub
+++ b/SCsub
@@ -1,4 +1,5 @@
 import os
+import version
 
 Import('env')
 
@@ -8,7 +9,6 @@ if os.path.isdir('../mesh_data_resource'):
     module_env.Append(CPPDEFINES=['MESH_DATA_RESOURCE_PRESENT'])
 
 sources = [
-
     "register_types.cpp",
     "mesh_utils.cpp",
     "mesh_merger.cpp",
@@ -16,6 +16,9 @@ sources = [
     "xatlas/xatlas.cpp",
 ]
 
+if version.major < 4:  # r128.c is only added to the build for engine major versions below 4
+    sources.append("delaunay/r128.c")
+
 if ARGUMENTS.get('custom_modules_shared', 'no') == 'yes':
     # Shared lib compilation
     module_env.Append(CCFLAGS=['-fPIC'])
diff --git a/delaunay/delaunay_3d.h b/delaunay/delaunay_3d.h
new file mode 100644
index 0000000..fd29323
--- /dev/null
+++ b/delaunay/delaunay_3d.h
@@ -0,0 +1,457 @@
+/*************************************************************************/
+/*  delaunay_3d.h                                                        */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef DELAUNAY_3D_H
+#define DELAUNAY_3D_H
+
+#include "core/math/aabb.h"
+#include "core/math/camera_matrix.h"
+#include "core/math/vector3.h"
+
+#include "r128.h"
+
+class Delaunay3D {
+	struct Simplex;
+
+	// Minimal integer 3D vector. In the code visible in this header it is used to
+	// address cells of the acceleration grid.
+	struct Vector3i {
+		int x;
+		int y;
+		int z;
+
+		// All components start at zero.
+		Vector3i() :
+				x(0), y(0), z(0) {}
+
+		// Component-wise construction; the unsigned inputs are stored as int.
+		Vector3i(const uint32_t p_x, const uint32_t p_y, const uint32_t p_z) :
+				x(p_x),
+				y(p_y),
+				z(p_z) {}
+
+		// Member-wise copy.
+		Vector3i(const Vector3i &other) :
+				x(other.x),
+				y(other.y),
+				z(other.z) {}
+
+		// Converts each floating-point component with static_cast<int>.
+		Vector3i(const Vector3 &v) :
+				x(static_cast<int>(v.x)),
+				y(static_cast<int>(v.y)),
+				z(static_cast<int>(v.z)) {}
+
+		// Member-wise assignment.
+		Vector3i &operator=(const Vector3i &other) {
+			x = other.x;
+			y = other.y;
+			z = other.z;
+			return *this;
+		}
+	};
+
+	enum {
+		ACCEL_GRID_SIZE = 16 // number of cells along each axis of the acceleration grid
+	};
+	struct GridPos {
+		Vector3i pos; // indices (x, y, z) of a cell in the acceleration grid
+		List<Simplex *>::Element *E = nullptr; // element holding the simplex inside that cell's list
+	};
+
+	// One tetrahedron of the triangulation together with its cached circumsphere.
+	struct Simplex {
+		uint32_t points[4]; // vertex indices into the point array
+		R128 circum_center_x;
+		R128 circum_center_y;
+		R128 circum_center_z;
+		R128 circum_r2; // squared circumsphere radius
+		LocalVector<GridPos> grid_positions; // grid cells this simplex is registered in
+		List<Simplex *>::Element *SE = nullptr; // element of this simplex in the global simplex list
+
+		// Leaves the vertex indices unset.
+		_FORCE_INLINE_ Simplex() {}
+
+		// Stores the four vertex indices in the order given.
+		_FORCE_INLINE_ Simplex(uint32_t p_a, uint32_t p_b, uint32_t p_c, uint32_t p_d) :
+				points{ p_a, p_b, p_c, p_d } {}
+	};
+
+	struct Triangle {
+		uint32_t triangle[3];
+		bool bad = false;
+		// Identity is the three stored indices only; the `bad` flag is not compared.
+		_FORCE_INLINE_ bool operator==(const Triangle &p_triangle) const {
+			return triangle[2] == p_triangle.triangle[2] && triangle[1] == p_triangle.triangle[1] && triangle[0] == p_triangle.triangle[0];
+		}
+		_FORCE_INLINE_ Triangle() {}
+		_FORCE_INLINE_ Triangle(uint32_t p_a, uint32_t p_b, uint32_t p_c) {
+			// Sort the indices ascending so the same face always yields the same key.
+			if (p_a > p_b) {
+				SWAP(p_a, p_b);
+			}
+			if (p_b > p_c) {
+				SWAP(p_b, p_c);
+				if (p_a > p_b) {
+					SWAP(p_a, p_b);
+				}
+			}
+			triangle[0] = p_a;
+			triangle[1] = p_b;
+			triangle[2] = p_c;
+		}
+	};
+
+	struct TriangleHasher {
+		// Chains hash_djb2_one_32 over the three indices, first index innermost.
+		_FORCE_INLINE_ static uint32_t hash(const Triangle &p_triangle) {
+			const uint32_t *ids = p_triangle.triangle;
+			return hash_djb2_one_32(ids[2], hash_djb2_one_32(ids[1], hash_djb2_one_32(ids[0])));
+		}
+	};
+
+	_FORCE_INLINE_ static void circum_sphere_compute(const Vector3 *p_points, Simplex *p_simplex) {
+		// Computes the circumsphere (center and squared radius) of p_simplex from its four vertices in
+		// p_points and stores it in the simplex. All arithmetic is done in R128 to limit precision errors.
+
+		R128 v0_x = p_points[p_simplex->points[0]].x;
+		R128 v0_y = p_points[p_simplex->points[0]].y;
+		R128 v0_z = p_points[p_simplex->points[0]].z;
+		R128 v1_x = p_points[p_simplex->points[1]].x;
+		R128 v1_y = p_points[p_simplex->points[1]].y;
+		R128 v1_z = p_points[p_simplex->points[1]].z;
+		R128 v2_x = p_points[p_simplex->points[2]].x;
+		R128 v2_y = p_points[p_simplex->points[2]].y;
+		R128 v2_z = p_points[p_simplex->points[2]].z;
+		R128 v3_x = p_points[p_simplex->points[3]].x;
+		R128 v3_y = p_points[p_simplex->points[3]].y;
+		R128 v3_z = p_points[p_simplex->points[3]].z;
+
+		// Rows of the "unrolled" 3x3 matrix: the edge vectors from vertex 0 to vertices 1, 2 and 3.
+		R128 row1_x = v1_x - v0_x;
+		R128 row1_y = v1_y - v0_y;
+		R128 row1_z = v1_z - v0_z;
+
+		R128 row2_x = v2_x - v0_x;
+		R128 row2_y = v2_y - v0_y;
+		R128 row2_z = v2_z - v0_z;
+
+		R128 row3_x = v3_x - v0_x;
+		R128 row3_y = v3_y - v0_y;
+		R128 row3_z = v3_z - v0_z;
+
+		R128 sq_lenght1 = row1_x * row1_x + row1_y * row1_y + row1_z * row1_z;
+		R128 sq_lenght2 = row2_x * row2_x + row2_y * row2_y + row2_z * row2_z;
+		R128 sq_lenght3 = row3_x * row3_x + row3_y * row3_y + row3_z * row3_z;
+
+		// Compute the determinant of that matrix.
+		R128 determinant = row1_x * (row2_y * row3_z - row3_y * row2_z) - row2_x * (row1_y * row3_z - row3_y * row1_z) + row3_x * (row1_y * row2_z - row2_y * row1_z);
+		// NOTE(review): a zero determinant (degenerate simplex) makes the divisor below zero - confirm how R128 division handles it.
+		// Compute the volume of the tetrahedron, and precompute a scalar quantity for re-use in the formula
+		R128 volume = determinant / R128(6.f);
+		R128 i12volume = R128(1.f) / (volume * R128(12.f));
+
+		R128 center_x = v0_x + i12volume * ((row2_y * row3_z - row3_y * row2_z) * sq_lenght1 - (row1_y * row3_z - row3_y * row1_z) * sq_lenght2 + (row1_y * row2_z - row2_y * row1_z) * sq_lenght3);
+		R128 center_y = v0_y + i12volume * (-(row2_x * row3_z - row3_x * row2_z) * sq_lenght1 + (row1_x * row3_z - row3_x * row1_z) * sq_lenght2 - (row1_x * row2_z - row2_x * row1_z) * sq_lenght3);
+		R128 center_z = v0_z + i12volume * ((row2_x * row3_y - row3_x * row2_y) * sq_lenght1 - (row1_x * row3_y - row3_x * row1_y) * sq_lenght2 + (row1_x * row2_y - row2_x * row1_y) * sq_lenght3);
+
+		// Once the center is known, the radius is the distance from it to any vertex; vertex 0 is used.
+
+		R128 rel1_x = center_x - v0_x;
+		R128 rel1_y = center_y - v0_y;
+		R128 rel1_z = center_z - v0_z;
+
+		R128 radius1 = rel1_x * rel1_x + rel1_y * rel1_y + rel1_z * rel1_z; // squared distance; no square root is taken
+
+		p_simplex->circum_center_x = center_x;
+		p_simplex->circum_center_y = center_y;
+		p_simplex->circum_center_z = center_z;
+		p_simplex->circum_r2 = radius1;
+	}
+
+	// True when vertex p_vertex lies strictly inside the circumsphere of p_simplex;
+	// the squared radius is reduced by a 0.00001 margin before the comparison.
+	_FORCE_INLINE_ static bool simplex_contains(const Vector3 *p_points, const Simplex &p_simplex, uint32_t p_vertex) {
+		const Vector3 &vertex = p_points[p_vertex];
+
+		R128 dx = p_simplex.circum_center_x - R128(vertex.x);
+		R128 dy = p_simplex.circum_center_y - R128(vertex.y);
+		R128 dz = p_simplex.circum_center_z - R128(vertex.z);
+
+		R128 dist2 = dx * dx + dy * dy + dz * dz;
+		R128 limit2 = p_simplex.circum_r2 - R128(0.00001);
+		return dist2 < limit2;
+	}
+
+	// Reports whether the four vertices of p_simplex are (nearly) coplanar.
+	// Two tests are applied: the distance of the fourth vertex from the plane through
+	// the first three, then the determinant of the 4x4 matrix holding the vertices as
+	// columns above a row of ones. Both use CMP_EPSILON as tolerance.
+	static bool simplex_is_coplanar(const Vector3 *p_points, const Simplex &p_simplex) {
+		const Vector3 &a = p_points[p_simplex.points[0]];
+		const Vector3 &b = p_points[p_simplex.points[1]];
+		const Vector3 &c = p_points[p_simplex.points[2]];
+		const Vector3 &d = p_points[p_simplex.points[3]];
+
+		// First test: fourth vertex against the plane of the other three.
+		Plane base(a, b, c);
+		if (ABS(base.distance_to(d)) < CMP_EPSILON) {
+			return true;
+		}
+
+		// Second test: build the matrix one column per vertex.
+		CameraMatrix cm;
+		for (int col = 0; col < 4; col++) {
+			const Vector3 &vertex = p_points[p_simplex.points[col]];
+			cm.matrix[0][col] = vertex.x;
+			cm.matrix[1][col] = vertex.y;
+			cm.matrix[2][col] = vertex.z;
+			cm.matrix[3][col] = 1.0; // bottom row of ones
+		}
+
+		const float det = camera_matrix_determinant(cm);
+		return ABS(det) <= CMP_EPSILON;
+	}
+
+
+	static float camera_matrix_determinant(const CameraMatrix &m) { // determinant of the 4x4 m.matrix, written out as its 24 signed four-factor products
+		return m.matrix[0][3] * m.matrix[1][2] * m.matrix[2][1] * m.matrix[3][0] - m.matrix[0][2] * m.matrix[1][3] * m.matrix[2][1] * m.matrix[3][0] -
+				m.matrix[0][3] * m.matrix[1][1] * m.matrix[2][2] * m.matrix[3][0] + m.matrix[0][1] * m.matrix[1][3] * m.matrix[2][2] * m.matrix[3][0] +
+				m.matrix[0][2] * m.matrix[1][1] * m.matrix[2][3] * m.matrix[3][0] - m.matrix[0][1] * m.matrix[1][2] * m.matrix[2][3] * m.matrix[3][0] -
+				m.matrix[0][3] * m.matrix[1][2] * m.matrix[2][0] * m.matrix[3][1] + m.matrix[0][2] * m.matrix[1][3] * m.matrix[2][0] * m.matrix[3][1] +
+				m.matrix[0][3] * m.matrix[1][0] * m.matrix[2][2] * m.matrix[3][1] - m.matrix[0][0] * m.matrix[1][3] * m.matrix[2][2] * m.matrix[3][1] -
+				m.matrix[0][2] * m.matrix[1][0] * m.matrix[2][3] * m.matrix[3][1] + m.matrix[0][0] * m.matrix[1][2] * m.matrix[2][3] * m.matrix[3][1] +
+				m.matrix[0][3] * m.matrix[1][1] * m.matrix[2][0] * m.matrix[3][2] - m.matrix[0][1] * m.matrix[1][3] * m.matrix[2][0] * m.matrix[3][2] -
+				m.matrix[0][3] * m.matrix[1][0] * m.matrix[2][1] * m.matrix[3][2] + m.matrix[0][0] * m.matrix[1][3] * m.matrix[2][1] * m.matrix[3][2] +
+				m.matrix[0][1] * m.matrix[1][0] * m.matrix[2][3] * m.matrix[3][2] - m.matrix[0][0] * m.matrix[1][1] * m.matrix[2][3] * m.matrix[3][2] -
+				m.matrix[0][2] * m.matrix[1][1] * m.matrix[2][0] * m.matrix[3][3] + m.matrix[0][1] * m.matrix[1][2] * m.matrix[2][0] * m.matrix[3][3] +
+				m.matrix[0][2] * m.matrix[1][0] * m.matrix[2][1] * m.matrix[3][3] - m.matrix[0][0] * m.matrix[1][2] * m.matrix[2][1] * m.matrix[3][3] -
+				m.matrix[0][1] * m.matrix[1][0] * m.matrix[2][2] * m.matrix[3][3] + m.matrix[0][0] * m.matrix[1][1] * m.matrix[2][2] * m.matrix[3][3];
+	}
+
+public:
+	struct OutputSimplex {
+		uint32_t points[4]; // indices of the four vertices within the point vector passed to tetrahedralize()
+	};
+
+	// Computes a 3D Delaunay tetrahedralization of p_points by incremental insertion: the points are
+	// rescaled to the unit cube and enclosed in one root simplex; for each point, the simplices whose
+	// circumsphere contains it are removed and every face seen only once among them is joined to the
+	// point as a new simplex. A grid of simplex lists limits which simplices are tested per point.
+	// Returns only the simplices that reference input points exclusively and are not coplanar.
+	static Vector<OutputSimplex> tetrahedralize(const Vector<Vector3> &p_points) {
+		uint32_t point_count = p_points.size();
+		Vector3 *points = (Vector3 *)memalloc(sizeof(Vector3) * (point_count + 4)); // +4: root simplex corners
+
+		{
+			const Vector3 *src_points = p_points.ptr();
+			AABB rect;
+			for (uint32_t i = 0; i < point_count; i++) {
+				Vector3 point = src_points[i];
+				if (i == 0) {
+					rect.position = point;
+				} else {
+					rect.expand_to(point);
+				}
+				points[i] = point;
+			}
+
+			// An axis on which all points coincide has zero extent; dividing by it would yield NaN
+			// coordinates (0 / 0). Such axes are divided by 1 instead, which maps them to 0.
+			Vector3 extent = rect.size;
+			extent.x = extent.x > 0 ? extent.x : real_t(1);
+			extent.y = extent.y > 0 ? extent.y : real_t(1);
+			extent.z = extent.z > 0 ? extent.z : real_t(1);
+
+			for (uint32_t i = 0; i < point_count; i++) {
+				points[i] = (points[i] - rect.position) / extent;
+			}
+
+			float delta_max = Math::sqrt(2.0) * 20.0;
+			Vector3 center = Vector3(0.5, 0.5, 0.5);
+
+			// any simplex that contains everything is good
+			points[point_count + 0] = center + Vector3(0, 1, 0) * delta_max;
+			points[point_count + 1] = center + Vector3(0, -1, 1) * delta_max;
+			points[point_count + 2] = center + Vector3(1, -1, -1) * delta_max;
+			points[point_count + 3] = center + Vector3(-1, -1, -1) * delta_max;
+		}
+
+		List<Simplex *> acceleration_grid[ACCEL_GRID_SIZE][ACCEL_GRID_SIZE][ACCEL_GRID_SIZE];
+
+		List<Simplex *> simplex_list;
+		{
+			// Create the root simplex and register it in every grid cell.
+			Simplex *root = memnew(Simplex(point_count + 0, point_count + 1, point_count + 2, point_count + 3));
+			root->SE = simplex_list.push_back(root);
+
+			for (uint32_t i = 0; i < ACCEL_GRID_SIZE; i++) {
+				for (uint32_t j = 0; j < ACCEL_GRID_SIZE; j++) {
+					for (uint32_t k = 0; k < ACCEL_GRID_SIZE; k++) {
+						GridPos gp;
+						gp.E = acceleration_grid[i][j][k].push_back(root);
+						gp.pos = Vector3i(i, j, k);
+						root->grid_positions.push_back(gp);
+					}
+				}
+			}
+
+			circum_sphere_compute(points, root);
+		}
+
+		HashMap<Triangle, uint32_t, TriangleHasher> triangles_inserted; // face -> index into `triangles`
+		LocalVector<Triangle> triangles; // faces of the simplices removed for the current point
+
+		for (uint32_t i = 0; i < point_count; i++) {
+			bool unique = true; // a point equal to a later one is skipped; the later copy is inserted
+			for (uint32_t j = i + 1; j < point_count; j++) {
+				if (points[i].is_equal_approx(points[j])) {
+					unique = false;
+					break;
+				}
+			}
+			if (!unique) {
+				continue;
+			}
+
+			Vector3i grid_pos = Vector3i(points[i] * ACCEL_GRID_SIZE);
+			grid_pos.x = CLAMP(grid_pos.x, 0, ACCEL_GRID_SIZE - 1);
+			grid_pos.y = CLAMP(grid_pos.y, 0, ACCEL_GRID_SIZE - 1);
+			grid_pos.z = CLAMP(grid_pos.z, 0, ACCEL_GRID_SIZE - 1);
+
+			for (List<Simplex *>::Element *E = acceleration_grid[grid_pos.x][grid_pos.y][grid_pos.z].front(); E;) {
+				List<Simplex *>::Element *N = E->next(); //may be deleted
+				Simplex *simplex = E->get();
+				if (simplex_contains(points, *simplex, i)) {
+					static const uint32_t triangle_order[4][3] = {
+						{ 0, 1, 2 },
+						{ 0, 1, 3 },
+						{ 0, 2, 3 },
+						{ 1, 2, 3 },
+					};
+
+					for (uint32_t k = 0; k < 4; k++) {
+						Triangle t = Triangle(simplex->points[triangle_order[k][0]], simplex->points[triangle_order[k][1]], simplex->points[triangle_order[k][2]]);
+						uint32_t *p = triangles_inserted.getptr(t);
+						if (p) {
+							triangles[*p].bad = true; // face seen before: mark it so no simplex is built on it
+						} else {
+							triangles_inserted.set(t, triangles.size());
+							triangles.push_back(t);
+						}
+					}
+
+					//remove simplex and continue
+					simplex_list.erase(simplex->SE);
+					for (uint32_t k = 0; k < simplex->grid_positions.size(); k++) {
+						Vector3i p = simplex->grid_positions[k].pos;
+						acceleration_grid[p.x][p.y][p.z].erase(simplex->grid_positions[k].E);
+					}
+					memdelete(simplex);
+				}
+				E = N;
+			}
+
+			for (uint32_t j = 0; j < triangles.size(); j++) {
+				if (triangles[j].bad) {
+					continue;
+				}
+				Simplex *new_simplex = memnew(Simplex(triangles[j].triangle[0], triangles[j].triangle[1], triangles[j].triangle[2], i));
+				circum_sphere_compute(points, new_simplex);
+				new_simplex->SE = simplex_list.push_back(new_simplex);
+				{
+					Vector3 center;
+					center.x = double(new_simplex->circum_center_x);
+					center.y = double(new_simplex->circum_center_y);
+					center.z = double(new_simplex->circum_center_z);
+
+					float radius = Math::sqrt(double(new_simplex->circum_r2));
+					radius += 0.0001; // small margin added before computing the covered grid cells
+					Vector3 extents = Vector3(radius, radius, radius);
+					Vector3i from = Vector3i((center - extents) * ACCEL_GRID_SIZE);
+					Vector3i to = Vector3i((center + extents) * ACCEL_GRID_SIZE);
+					from.x = CLAMP(from.x, 0, ACCEL_GRID_SIZE - 1);
+					from.y = CLAMP(from.y, 0, ACCEL_GRID_SIZE - 1);
+					from.z = CLAMP(from.z, 0, ACCEL_GRID_SIZE - 1);
+					to.x = CLAMP(to.x, 0, ACCEL_GRID_SIZE - 1);
+					to.y = CLAMP(to.y, 0, ACCEL_GRID_SIZE - 1);
+					to.z = CLAMP(to.z, 0, ACCEL_GRID_SIZE - 1);
+
+					for (int32_t x = from.x; x <= to.x; x++) {
+						for (int32_t y = from.y; y <= to.y; y++) {
+							for (int32_t z = from.z; z <= to.z; z++) {
+								GridPos gp;
+								gp.pos = Vector3i(x, y, z); // indices are clamped to [0, ACCEL_GRID_SIZE - 1] above
+								gp.E = acceleration_grid[x][y][z].push_back(new_simplex);
+								new_simplex->grid_positions.push_back(gp);
+							}
+						}
+					}
+				}
+			}
+
+			triangles.clear();
+			triangles_inserted.clear();
+		}
+
+		Vector<OutputSimplex> ret_simplices;
+		ret_simplices.resize(simplex_list.size());
+		OutputSimplex *ret_simplicesw = ret_simplices.ptrw();
+		uint32_t simplices_written = 0;
+
+		for (List<Simplex *>::Element *E = simplex_list.front(); E; E = E->next()) {
+			Simplex *simplex = E->get();
+			bool invalid = false;
+			for (int j = 0; j < 4; j++) {
+				if (simplex->points[j] >= point_count) { // uses a corner of the root simplex
+					invalid = true;
+					break;
+				}
+			}
+			if (invalid || simplex_is_coplanar(points, *simplex)) {
+				memdelete(simplex);
+				continue;
+			}
+
+			ret_simplicesw[simplices_written].points[0] = simplex->points[0];
+			ret_simplicesw[simplices_written].points[1] = simplex->points[1];
+			ret_simplicesw[simplices_written].points[2] = simplex->points[2];
+			ret_simplicesw[simplices_written].points[3] = simplex->points[3];
+			simplices_written++;
+			memdelete(simplex);
+		}
+
+		ret_simplices.resize(simplices_written);
+		memfree(points);
+		return ret_simplices;
+	}
+};
+
+#endif // DELAUNAY_3D_H
diff --git a/delaunay/r128.c b/delaunay/r128.c
new file mode 100644
index 0000000..6b981aa
--- /dev/null
+++ b/delaunay/r128.c
@@ -0,0 +1,2 @@
+#define R128_IMPLEMENTATION
+#include "r128.h"
diff --git a/delaunay/r128.h b/delaunay/r128.h
new file mode 100644
index 0000000..a345cc4
--- /dev/null
+++ b/delaunay/r128.h
@@ -0,0 +1,2123 @@
+/*
+r128.h: 128-bit (64.64) signed fixed-point arithmetic. Version 1.4.4
+
+COMPILATION
+-----------
+Drop this header file somewhere in your project and include it wherever it is
+needed. There is no separate .c file for this library. To get the code, in ONE
+file in your project, put:
+
+#define R128_IMPLEMENTATION
+
+before you include this file. You may also provide a definition for R128_ASSERT
+to force the library to use a custom assert macro.
+
+COMPILER/LIBRARY SUPPORT
+------------------------
+This library requires a C89 compiler with support for 64-bit integers. If your
+compiler does not support the long long data type, the R128_U64, etc. macros
+must be set appropriately. On x86 and x64 targets, Intel intrinsics are used
+for speed. If your compiler does not support these intrinsics, you can add
+#define R128_STDC_ONLY
+in your implementation file before including r128.h.
+
+The only C runtime library functionality used by this library is <assert.h>.
+This can be avoided by defining an R128_ASSERT macro in your implementation
+file. Since this library uses 64-bit arithmetic, this may implicitly add a
+runtime library dependency on 32-bit platforms.
+
+C++ SUPPORT
+-----------
+Operator overloads are supplied for C++ files that include this file. Since all
+C++ functions are declared inline (or static inline), the R128_IMPLEMENTATION
+file can be either C++ or C.
+
+LICENSE
+-------
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef H_R128_H
+#define H_R128_H
+
+#include <stddef.h>
+
+// 64-bit integer support
+// If your compiler does not have stdint.h, add appropriate defines for these macros.
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+#  define R128_S32 __int32
+#  define R128_U32 unsigned __int32
+#  define R128_S64 __int64
+#  define R128_U64 unsigned __int64
+#  define R128_LIT_S64(x) x##i64
+#  define R128_LIT_U64(x) x##ui64
+#else
+#  include <stdint.h>
+#  define R128_S32 int32_t
+#  define R128_U32 uint32_t
+#  define R128_S64 long long
+#  define R128_U64 unsigned long long
+#  define R128_LIT_S64(x) x##ll
+#  define R128_LIT_U64(x) x##ull
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct R128 {
+   R128_U64 lo;   // low 64-bit word
+   R128_U64 hi;   // high 64-bit word
+
+#ifdef __cplusplus
+   R128();        // defined with an empty body: lo and hi are not initialized
+   R128(double);
+   R128(int);
+   R128(R128_S64);
+   R128(R128_U64 low, R128_U64 high);
+
+   operator double() const;
+   operator R128_S64() const;
+   operator int() const;
+   operator bool() const;
+
+   bool operator!() const;
+   R128 operator~() const;
+   R128 operator-() const;
+   R128 &operator|=(const R128 &rhs);
+   R128 &operator&=(const R128 &rhs);
+   R128 &operator^=(const R128 &rhs);
+   R128 &operator+=(const R128 &rhs);
+   R128 &operator-=(const R128 &rhs);
+   R128 &operator*=(const R128 &rhs);
+   R128 &operator/=(const R128 &rhs);
+   R128 &operator%=(const R128 &rhs);
+   R128 &operator<<=(int amount);
+   R128 &operator>>=(int amount);   // implemented with r128Sar (arithmetic shift)
+#endif   //__cplusplus
+} R128;
+
+// Type conversion
+extern void r128FromInt(R128 *dst, R128_S64 v);
+extern void r128FromFloat(R128 *dst, double v);
+extern R128_S64 r128ToInt(const R128 *v);
+extern double r128ToFloat(const R128 *v);
+
+// Copy
+extern void r128Copy(R128 *dst, const R128 *src);
+
+// Negate
+extern void r128Neg(R128 *dst, const R128 *src);
+
+// Bitwise operations
+extern void r128Not(R128 *dst, const R128 *src);               // ~a
+extern void r128Or(R128 *dst, const R128 *a, const R128 *b);   // a | b
+extern void r128And(R128 *dst, const R128 *a, const R128 *b);  // a & b
+extern void r128Xor(R128 *dst, const R128 *a, const R128 *b);  // a ^ b
+extern void r128Shl(R128 *dst, const R128 *src, int amount);   // shift left by amount mod 128
+extern void r128Shr(R128 *dst, const R128 *src, int amount);   // shift right logical by amount mod 128
+extern void r128Sar(R128 *dst, const R128 *src, int amount);   // shift right arithmetic by amount mod 128
+
+// Arithmetic
+extern void r128Add(R128 *dst, const R128 *a, const R128 *b);  // a + b
+extern void r128Sub(R128 *dst, const R128 *a, const R128 *b);  // a - b
+extern void r128Mul(R128 *dst, const R128 *a, const R128 *b);  // a * b
+extern void r128Div(R128 *dst, const R128 *a, const R128 *b);  // a / b
+extern void r128Mod(R128 *dst, const R128 *a, const R128 *b);  // a - toInt(a / b) * b
+
+extern void r128Sqrt(R128 *dst, const R128 *v);  // sqrt(v)
+extern void r128Rsqrt(R128 *dst, const R128 *v); // 1 / sqrt(v)
+
+// Comparison
+extern int  r128Cmp(const R128 *a, const R128 *b);  // sign of a-b
+extern void r128Min(R128 *dst, const R128 *a, const R128 *b);
+extern void r128Max(R128 *dst, const R128 *a, const R128 *b);
+extern void r128Floor(R128 *dst, const R128 *v);
+extern void r128Ceil(R128 *dst, const R128 *v);
+extern int  r128IsNeg(const R128 *v); // quick check for < 0
+
+// String conversion
+//
+typedef enum R128ToStringSign {
+   R128ToStringSign_Default,  // no sign character for positive values
+   R128ToStringSign_Space,    // leading space for positive values
+   R128ToStringSign_Plus,     // leading '+' for positive values
+} R128ToStringSign;
+
+// Formatting options for use with r128ToStringOpt. The "defaults" correspond
+// to a format string of "%f".
+//
+typedef struct R128ToStringFormat {
+   // sign character for positive values. Default is R128ToStringSign_Default.
+   R128ToStringSign sign;
+
+   // minimum number of characters to write. Default is 0.
+   int width;
+
+   // place to the right of the decimal at which rounding is performed. If negative,
+   // a maximum of 20 decimal places will be written, with no trailing zeroes.
+   // (20 places is sufficient to ensure that r128FromString will convert back to the
+   // original value.) Default is -1. NOTE: This is not the same default that the C
+   // standard library uses for %f.
+   int precision;
+
+   // If non-zero, pads the output string with leading zeroes if the final result is
+   // fewer than width characters. Otherwise, leading spaces are used. Default is 0.
+   int zeroPad;
+
+   // Always print a decimal point, even if the value is an integer. Default is 0.
+   int decimal;
+
+   // Left-align output if width specifier requires padding.
+   // Default is 0 (right align).
+   int leftAlign;
+} R128ToStringFormat;
+
+// r128ToStringOpt: convert R128 to a decimal string, with formatting.
+//
+// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written
+// (including null terminator). No additional rounding is performed if dstSize is not large
+// enough to hold the entire string.
+//
+// opt: an R128ToStringFormat struct (q.v.) with formatting options.
+//
+// Uses the R128_decimal global as the decimal point character.
+// Always writes a null terminator, even if the destination buffer is not large enough.
+//
+// Number of bytes that will be written (i.e. how big does dst need to be?):
+// If width is specified: width + 1 bytes.
+// If precision is specified: at most precision + 22 bytes.
+// If neither is specified: at most 42 bytes.
+//
+// Returns the number of bytes that would have been written if dst was sufficiently large,
+// not including the final null terminator.
+//
+extern int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt);
+
+// r128ToStringf: convert R128 to a decimal string, with formatting.
+//
+// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written
+// (including null terminator).
+//
+// format: a printf-style format specifier, as one would use with floating point types.
+//    e.g. "%+5.2f". (The leading % and trailing f are optional.)
+//    NOTE: This is NOT a full replacement for sprintf. Any characters in the format string
+//       that do not correspond to a format placeholder are ignored.
+//
+// Uses the R128_decimal global as the decimal point character.
+// Always writes a null terminator, even if the destination buffer is not large enough.
+//
+// Number of bytes that will be written (i.e. how big does dst need to be?):
+// If the precision field is specified: at most max(width, precision + 21) + 1 bytes
+// Otherwise: at most max(width, 41) + 1 bytes.
+//
+// Returns the number of bytes that would have been written if dst was sufficiently large,
+// not including the final null terminator.
+//
+extern int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v);
+
+// r128ToString: convert R128 to a decimal string, with default formatting.
+// Equivalent to r128ToStringf(dst, dstSize, "%f", v).
+//
+// Uses the R128_decimal global as the decimal point character.
+// Always writes a null terminator, even if the destination buffer is not large enough.
+//
+// Will write at most 42 bytes (including NUL) to dst.
+//
+// Returns the number of bytes that would have been written if dst was sufficiently large,
+// not including the final null terminator.
+//
+extern int r128ToString(char *dst, size_t dstSize, const R128 *v);
+
+// r128FromString: Convert string to R128.
+//
+// The string can be formatted either as a decimal number with optional sign
+// or as hexadecimal with a prefix of 0x or 0X.
+//
+// endptr, if not NULL, is set to the character following the last character
+//   used in the conversion.
+//
+extern void r128FromString(R128 *dst, const char *s, char **endptr);
+
+// Constants
+extern const R128 R128_min;      // minimum (most negative) value
+extern const R128 R128_max;      // maximum (most positive) value
+extern const R128 R128_smallest; // smallest positive value
+extern const R128 R128_zero;     // zero
+extern const R128 R128_one;      // 1.0
+
+extern char R128_decimal;        // decimal point character used by r128From/ToString. defaults to '.'
+
+#ifdef __cplusplus
+}
+
+#include <limits>
+namespace std {
+template<>
+struct numeric_limits<R128>   // std::numeric_limits specialization describing R128
+{
+   static const bool is_specialized = true;
+
+   static R128 min() throw() { return R128_min; }
+   static R128 max() throw() { return R128_max; }
+
+   static const int digits = 127;
+   static const int digits10 = 38;
+   static const bool is_signed = true;
+   static const bool is_integer = false;
+   static const bool is_exact = false;
+   static const int radix = 2;
+   static R128 epsilon() throw() { return R128_smallest; }
+   static R128 round_error() throw() { return R128_one; }
+
+   static const int min_exponent = 0;
+   static const int min_exponent10 = 0;
+   static const int max_exponent = 0;
+   static const int max_exponent10 = 0;
+
+   static const bool has_infinity = false;
+   static const bool has_quiet_NaN = false;
+   static const bool has_signaling_NaN = false;
+   static const float_denorm_style has_denorm = denorm_absent;
+   static const bool has_denorm_loss = false;
+
+   static R128 infinity() throw() { return R128_zero; }        // has_infinity is false; returns zero
+   static R128 quiet_NaN() throw() { return R128_zero; }       // has_quiet_NaN is false; returns zero
+   static R128 signaling_NaN() throw() { return R128_zero; }   // has_signaling_NaN is false; returns zero
+   static R128 denorm_min() throw() { return R128_zero; }      // has_denorm is denorm_absent; returns zero
+
+   static const bool is_iec559 = false;
+   static const bool is_bounded = true;
+   static const bool is_modulo = true;
+
+   static const bool traps = numeric_limits<R128_U64>::traps;
+   static const bool tinyness_before = false;
+   static const float_round_style round_style = round_toward_zero;
+};
+}  //namespace std
+
+inline R128::R128() {}   // intentionally empty: lo and hi stay uninitialized
+
+inline R128::R128(double v)   // conversion is delegated to r128FromFloat
+{
+   r128FromFloat(this, v);
+}
+
+inline R128::R128(int v)   // conversion is delegated to r128FromInt
+{
+   r128FromInt(this, v);
+}
+
+inline R128::R128(R128_S64 v)   // conversion is delegated to r128FromInt
+{
+   r128FromInt(this, v);
+}
+
+inline R128::R128(R128_U64 low, R128_U64 high)   // stores the two raw 64-bit words as given
+{
+   lo = low;
+   hi = high;
+}
+
+inline R128::operator double() const   // conversion is delegated to r128ToFloat
+{
+   return r128ToFloat(this);
+}
+
+inline R128::operator R128_S64() const   // conversion is delegated to r128ToInt
+{
+   return r128ToInt(this);
+}
+
+inline R128::operator int() const   // r128ToInt result cast down to int
+{
+   return (int) r128ToInt(this);
+}
+
+inline R128::operator bool() const   // true when either word is non-zero
+{
+   return lo || hi;
+}
+
+inline bool R128::operator!() const   // true when both words are zero
+{
+   return !lo && !hi;
+}
+
+inline R128 R128::operator~() const   // bitwise NOT via r128Not; returns a new value
+{
+   R128 r;
+   r128Not(&r, this);
+   return r;
+}
+
+inline R128 R128::operator-() const   // negation via r128Neg; returns a new value
+{
+   R128 r;
+   r128Neg(&r, this);
+   return r;
+}
+
+inline R128 &R128::operator|=(const R128 &rhs)
+{
+   r128Or(this, this, &rhs);
+   return *this;
+}
+
+inline R128 &R128::operator&=(const R128 &rhs)
+{
+   r128And(this, this, &rhs);
+   return *this;
+}
+
+inline R128 &R128::operator^=(const R128 &rhs)
+{
+   r128Xor(this, this, &rhs);
+   return *this;
+}
+
+// In-place addition via r128Add; *this is both destination and first operand.
+inline R128 &R128::operator+=(const R128 &rhs)
+{
+   r128Add(this, this, &rhs);
+   return *this;
+}
+
+// In-place subtraction via r128Sub; *this is both destination and first operand.
+inline R128 &R128::operator-=(const R128 &rhs)
+{
+   r128Sub(this, this, &rhs);
+   return *this;
+}
+
+// In-place multiplication via r128Mul; *this is both destination and first operand.
+inline R128 &R128::operator*=(const R128 &rhs)
+{
+   r128Mul(this, this, &rhs);
+   return *this;
+}
+
+// In-place division via r128Div; *this is both destination and dividend.
+inline R128 &R128::operator/=(const R128 &rhs)
+{
+   r128Div(this, this, &rhs);
+   return *this;
+}
+
+// In-place modulo via r128Mod; *this is both destination and dividend.
+inline R128 &R128::operator%=(const R128 &rhs)
+{
+   r128Mod(this, this, &rhs);
+   return *this;
+}
+
+// In-place left shift by `amount` bits via r128Shl.
+inline R128 &R128::operator<<=(int amount)
+{
+   r128Shl(this, this, amount);
+   return *this;
+}
+
+// In-place right shift by `amount` bits. Uses r128Sar (the arithmetic shift), not r128Shr.
+inline R128 &R128::operator>>=(int amount)
+{
+   r128Sar(this, this, amount);
+   return *this;
+}
+
+// Bitwise OR: copies lhs and applies operator|= with rhs.
+static inline R128 operator|(const R128 &lhs, const R128 &rhs)
+{
+   R128 result(lhs);
+   result |= rhs;
+   return result;
+}
+
+// Bitwise AND: copies lhs and applies operator&= with rhs.
+static inline R128 operator&(const R128 &lhs, const R128 &rhs)
+{
+   R128 result(lhs);
+   result &= rhs;
+   return result;
+}
+
+// Bitwise XOR: copies lhs and applies operator^= with rhs.
+static inline R128 operator^(const R128 &lhs, const R128 &rhs)
+{
+   R128 result(lhs);
+   result ^= rhs;
+   return result;
+}
+
+// Sum: copies lhs and applies operator+= with rhs.
+static inline R128 operator+(const R128 &lhs, const R128 &rhs)
+{
+   R128 result(lhs);
+   result += rhs;
+   return result;
+}
+
+// Difference: copies lhs and applies operator-= with rhs.
+static inline R128 operator-(const R128 &lhs, const R128 &rhs)
+{
+   R128 result(lhs);
+   result -= rhs;
+   return result;
+}
+
+// Product: copies lhs and applies operator*= with rhs.
+static inline R128 operator*(const R128 &lhs, const R128 &rhs)
+{
+   R128 result(lhs);
+   result *= rhs;
+   return result;
+}
+
+// Quotient: copies lhs and applies operator/= with rhs.
+static inline R128 operator/(const R128 &lhs, const R128 &rhs)
+{
+   R128 result(lhs);
+   result /= rhs;
+   return result;
+}
+
+// Modulo: copies lhs and applies operator%= with rhs.
+static inline R128 operator%(const R128 &lhs, const R128 &rhs)
+{
+   R128 result(lhs);
+   result %= rhs;
+   return result;
+}
+
+// Left shift: copies lhs and applies operator<<= with amount.
+static inline R128 operator<<(const R128 &lhs, int amount)
+{
+   R128 result(lhs);
+   result <<= amount;
+   return result;
+}
+
+// Right shift: copies lhs and applies operator>>= with amount.
+static inline R128 operator>>(const R128 &lhs, int amount)
+{
+   R128 result(lhs);
+   result >>= amount;
+   return result;
+}
+
+// Less-than, decided by the sign of r128Cmp(lhs, rhs).
+static inline bool operator<(const R128 &lhs, const R128 &rhs)
+{
+   const int order = r128Cmp(&lhs, &rhs);
+   return order < 0;
+}
+
+// Greater-than, decided by the sign of r128Cmp(lhs, rhs).
+static inline bool operator>(const R128 &lhs, const R128 &rhs)
+{
+   const int order = r128Cmp(&lhs, &rhs);
+   return order > 0;
+}
+
+// Less-than-or-equal, decided by the sign of r128Cmp(lhs, rhs).
+static inline bool operator<=(const R128 &lhs, const R128 &rhs)
+{
+   const int order = r128Cmp(&lhs, &rhs);
+   return order <= 0;
+}
+
+// Greater-than-or-equal, decided by the sign of r128Cmp(lhs, rhs).
+static inline bool operator>=(const R128 &lhs, const R128 &rhs)
+{
+   const int order = r128Cmp(&lhs, &rhs);
+   return order >= 0;
+}
+
+// Equality: both halves must match bit for bit.
+static inline bool operator==(const R128 &lhs, const R128 &rhs)
+{
+   return !(lhs.lo != rhs.lo || lhs.hi != rhs.hi);
+}
+
+// Inequality: true when either half differs.
+static inline bool operator!=(const R128 &lhs, const R128 &rhs)
+{
+   return !(lhs.lo == rhs.lo && lhs.hi == rhs.hi);
+}
+
+#endif   //__cplusplus
+#endif   //H_R128_H
+
+#ifdef R128_IMPLEMENTATION
+
+// With R128_DEBUG_VIS defined, R128_DEBUG_SET(x) formats x into the R128_last buffer
+// through r128ToString; otherwise it expands to nothing.
+#ifdef R128_DEBUG_VIS
+#  define R128_DEBUG_SET(x)   r128ToString(R128_last, sizeof(R128_last), x)
+#else
+#  define R128_DEBUG_SET(x)
+#endif
+
+// Word access helpers:
+//   R128_SET2(x, l, h)        stores the two 64-bit halves lo and hi.
+//   R128_SET4(x, r0..r3)      stores four 32-bit words; r0/r1 are the low/high words
+//                             of lo, r2/r3 the low/high words of hi.
+//   R128_R0(x) .. R128_R3(x)  read those four 32-bit words back.
+#define R128_SET2(x, l, h) do { (x)->lo = (R128_U64)(l); (x)->hi = (R128_U64)(h); } while(0)
+#define R128_R0(x) ((R128_U32)(x)->lo)
+#define R128_R2(x) ((R128_U32)(x)->hi)
+#if defined(_M_IX86)
+// workaround: MSVC x86's handling of 64-bit values is not great
+// (this variant addresses the 32-bit words through pointer casts instead of 64-bit shifts)
+#  define R128_SET4(x, r0, r1, r2, r3) do { \
+      ((R128_U32*)&(x)->lo)[0] = (R128_U32)(r0); \
+      ((R128_U32*)&(x)->lo)[1] = (R128_U32)(r1); \
+      ((R128_U32*)&(x)->hi)[0] = (R128_U32)(r2); \
+      ((R128_U32*)&(x)->hi)[1] = (R128_U32)(r3); \
+      } while(0)
+#  define R128_R1(x) (((R128_U32*)&(x)->lo)[1])
+#  define R128_R3(x) (((R128_U32*)&(x)->hi)[1])
+#else
+#  define R128_SET4(x, r0, r1, r2, r3) do { (x)->lo = (R128_U64)(r0) | ((R128_U64)(r1) << 32); \
+      (x)->hi = (R128_U64)(r2) | ((R128_U64)(r3) << 32); } while(0)
+#  define R128_R1(x) ((R128_U32)((x)->lo >> 32))
+#  define R128_R3(x) ((R128_U32)((x)->hi >> 32))
+#endif
+
+// Platform detection. Sets R128_INTEL (x86 / x86-64 target) and R128_64BIT (64-bit target)
+// from compiler-predefined macros, and pulls in the compiler intrinsics header unless
+// R128_STDC_ONLY is defined. Both macros default to 0 below when no branch sets them.
+#if defined(_M_X64)
+#  define R128_INTEL 1
+#  define R128_64BIT 1
+#  ifndef R128_STDC_ONLY
+#     include <intrin.h>
+#  endif
+#elif defined(__x86_64__)
+#  define R128_INTEL 1
+#  define R128_64BIT 1
+#  ifndef R128_STDC_ONLY
+#     include <x86intrin.h>
+#  endif
+#elif defined(_M_IX86)
+#  define R128_INTEL 1
+#  ifndef R128_STDC_ONLY
+#     include <intrin.h>
+#  endif
+#elif defined(__i386__)
+#  define R128_INTEL 1
+#  ifndef R128_STDC_ONLY
+#     include <x86intrin.h>
+#  endif
+#elif defined(_M_ARM)
+#  ifndef R128_STDC_ONLY
+#     include <intrin.h>
+#  endif
+#elif defined(_M_ARM64)
+#  define R128_64BIT 1
+#  ifndef R128_STDC_ONLY
+#     include <intrin.h>
+#  endif
+#elif defined(__aarch64__)
+#  define R128_64BIT 1
+#endif
+
+#ifndef R128_INTEL
+#  define R128_INTEL 0
+#endif
+
+#ifndef R128_64BIT
+#  define R128_64BIT 0
+#endif
+
+// R128_ASSERT may be predefined by the including code; it falls back to assert().
+#ifndef R128_ASSERT
+#  include <assert.h>
+#  define R128_ASSERT(x) assert(x)
+#endif
+
+#include <stdlib.h>  // for NULL
+
+// Format used by r128ToString and as the starting point for r128ToStringf.
+// NOTE(review): the field order is not visible here; presumably sign, width, precision,
+// zeroPad, decimal, leftAlign (so precision == -1) -- confirm against the struct declaration.
+static const R128ToStringFormat R128__defaultFormat = {
+   R128ToStringSign_Default,
+   0,
+   -1,
+   0,
+   0,
+   0
+};
+
+// Global constants. The initializers appear to be in { lo, hi } order, which matches
+// R128_one having hi == 1 (r128FromInt stores the integer part in hi).
+const R128 R128_min = { 0, R128_LIT_U64(0x8000000000000000) };
+const R128 R128_max = { R128_LIT_U64(0xffffffffffffffff), R128_LIT_U64(0x7fffffffffffffff) };
+const R128 R128_smallest = { 1, 0 };
+const R128 R128_zero = { 0, 0 };
+const R128 R128_one = { 0, 1 };
+// Character written and recognised as the decimal separator by the string functions.
+char R128_decimal = '.';
+#ifdef R128_DEBUG_VIS
+// Buffer written by R128_DEBUG_SET.
+char R128_last[42];
+#endif
+
+// Counts the leading zero bits of a 64-bit value. Every branch returns 64 for x == 0.
+static int r128__clz64(R128_U64 x)
+{
+#if defined(R128_STDC_ONLY)
+   // Portable binary search: whenever the upper part of the current window is non-zero,
+   // subtract the window size from n and continue with that upper part.
+   R128_U64 n = 64, y;
+   y = x >> 32; if (y) { n -= 32; x = y; }
+   y = x >> 16; if (y) { n -= 16; x = y; }
+   y = x >>  8; if (y) { n -=  8; x = y; }
+   y = x >>  4; if (y) { n -=  4; x = y; }
+   y = x >>  2; if (y) { n -=  2; x = y; }
+   y = x >>  1; if (y) { n -=  1; x = y; }
+   return (int)(n - x);   // x is 0 or 1 at this point
+#elif defined(_M_X64) || defined(_M_ARM64)
+   // idx receives the position of the highest set bit when the scan succeeds
+   unsigned long idx;
+   if (_BitScanReverse64(&idx, x)) {
+      return 63 - (int)idx;
+   } else {
+      return 64;
+   }
+#elif defined(_MSC_VER)
+   // 32-bit MSVC: scan the high word first, then the low word
+   unsigned long idx;
+   if (_BitScanReverse(&idx, (R128_U32)(x >> 32))) {
+      return 31 - (int)idx;
+   } else if (_BitScanReverse(&idx, (R128_U32)x)) {
+      return 63 - (int)idx;
+   } else {
+      return 64;
+   }
+#else
+   // x == 0 is handled explicitly before calling the builtin
+   return x ? __builtin_clzll(x) : 64;
+#endif
+}
+
+// 32-bit helpers. They are defined for 32-bit targets, and for 64-bit targets that are
+// either built with R128_STDC_ONLY or are not Intel.
+#if !R128_64BIT
+// 32*32->64
+// Full 64-bit product of two 32-bit values.
+static R128_U64 r128__umul64(R128_U32 a, R128_U32 b)
+{
+#  if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
+   return __emulu(a, b);
+#  elif defined(_M_ARM) && !defined(R128_STDC_ONLY)
+   return _arm_umull(a, b);
+#  else
+   return a * (R128_U64)b;
+#  endif
+}
+
+// 64/32->32
+// Divides the 64-bit value nhi:nlo by d; returns the 32-bit quotient and stores the
+// remainder in *rem. The portable branch truncates the quotient to 32 bits.
+static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem)
+{
+#  if defined(_M_IX86) && (_MSC_VER >= 1920) && !defined(R128_STDC_ONLY)
+   unsigned __int64 n = ((unsigned __int64)nhi << 32) | nlo;
+   return _udiv64(n, d, rem);
+#  elif defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
+   // No return statement: the quotient is left in eax by the div instruction.
+   __asm {
+      mov eax, nlo
+      mov edx, nhi
+      div d
+      mov ecx, rem
+      mov dword ptr [ecx], edx
+   }
+#  elif defined(__i386__) && !defined(R128_STDC_ONLY)
+   R128_U32 q, r;
+   __asm("divl %4"
+      : "=a"(q), "=d"(r)
+      : "a"(nlo), "d"(nhi), "X"(d));
+   *rem = r;
+   return q;
+#  else
+   R128_U64 n64 = ((R128_U64)nhi << 32) | nlo;
+   *rem = (R128_U32)(n64 % d);
+   return (R128_U32)(n64 / d);
+#  endif
+}
+#elif defined(R128_STDC_ONLY) || !R128_INTEL
+// 64-bit target without Intel intrinsics: plain C versions of the same two helpers.
+#define r128__umul64(a, b) ((a) * (R128_U64)(b))
+static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem)
+{
+   R128_U64 n64 = ((R128_U64)nhi << 32) | nlo;
+   *rem = (R128_U32)(n64 % d);
+   return (R128_U32)(n64 / d);
+}
+#endif   //!R128_64BIT
+
+// Two's-complement negation of a 128-bit value: dst = ~src + 1.
+// Each branch reads the src words it needs before writing the matching dst words, and
+// callers in this file pass dst == src.
+static void r128__neg(R128 *dst, const R128 *src)
+{
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(src != NULL);
+
+#if R128_INTEL && !defined(R128_STDC_ONLY)
+   {
+      // add 1 to the complemented low word and let the carry ripple upward
+      unsigned char carry = 0;
+#  if R128_64BIT
+      carry = _addcarry_u64(carry, ~src->lo, 1, &dst->lo);
+      carry = _addcarry_u64(carry, ~src->hi, 0, &dst->hi);
+#  else
+      R128_U32 r0, r1, r2, r3;
+      carry = _addcarry_u32(carry, ~R128_R0(src), 1, &r0);
+      carry = _addcarry_u32(carry, ~R128_R1(src), 0, &r1);
+      carry = _addcarry_u32(carry, ~R128_R2(src), 0, &r2);
+      carry = _addcarry_u32(carry, ~R128_R3(src), 0, &r3);
+      R128_SET4(dst, r0, r1, r2, r3);
+#  endif //R128_64BIT
+   }
+#else
+   // ~lo + 1 carries into hi only when lo is zero
+   if (src->lo) {
+      dst->lo = ~src->lo + 1;
+      dst->hi = ~src->hi;
+   } else {
+      dst->lo = 0;
+      dst->hi = ~src->hi + 1;
+   }
+#endif   //R128_INTEL
+}
+
+// 64*64->128
+// Computes the full 128-bit unsigned product a * b into dst (dst->hi:dst->lo).
+// Uses _umul128 on MSVC x64 and unsigned __int128 on other 64-bit non-MSVC compilers.
+// Otherwise it builds the result from four 32x32->64 partial products:
+//    p0 = alo*blo, p1 = alo*bhi, p2 = ahi*blo, p3 = ahi*bhi
+//    product = p0 + ((p1 + p2) << 32) + (p3 << 64)
+static void r128__umul128(R128 *dst, R128_U64 a, R128_U64 b)
+{
+#if defined(_M_X64) && !defined(R128_STDC_ONLY)
+   dst->lo = _umul128(a, b, &dst->hi);
+#elif R128_64BIT && !defined(_MSC_VER) && !defined(R128_STDC_ONLY)
+   unsigned __int128 p0 = a * (unsigned __int128)b;
+   dst->hi = (R128_U64)(p0 >> 64);
+   dst->lo = (R128_U64)p0;
+#else
+   R128_U32 alo = (R128_U32)a;
+   R128_U32 ahi = (R128_U32)(a >> 32);
+   R128_U32 blo = (R128_U32)b;
+   R128_U32 bhi = (R128_U32)(b >> 32);
+   R128_U64 p0, p1, p2, p3;
+
+   p0 = r128__umul64(alo, blo);
+   p1 = r128__umul64(alo, bhi);
+   p2 = r128__umul64(ahi, blo);
+   p3 = r128__umul64(ahi, bhi);
+
+   {
+#if R128_INTEL && !defined(R128_STDC_ONLY)
+      R128_U32 r0, r1, r2, r3;
+      unsigned char carry;
+
+      r0 = (R128_U32)(p0);
+      r1 = (R128_U32)(p0 >> 32);
+      r2 = (R128_U32)(p1 >> 32);
+      r3 = (R128_U32)(p3 >> 32);
+
+      // first pass adds p1.lo and p2.hi, second pass adds p2.lo and p3.lo
+      carry = _addcarry_u32(0, r1, (R128_U32)p1, &r1);
+      carry = _addcarry_u32(carry, r2, (R128_U32)(p2 >> 32), &r2);
+      _addcarry_u32(carry, r3, 0, &r3);
+      carry = _addcarry_u32(0, r1, (R128_U32)p2, &r1);
+      carry = _addcarry_u32(carry, r2, (R128_U32)p3, &r2);
+      _addcarry_u32(carry, r3, 0, &r3);
+
+      R128_SET4(dst, r0, r1, r2, r3);
+#else
+      R128_U64 carry, lo, hi;
+      // carry out of bits 32..63 of the result (0, 1 or 2)
+      carry = ((R128_U64)(R128_U32)p1 + (R128_U64)(R128_U32)p2 + (p0 >> 32)) >> 32;
+
+      lo = p0 + ((p1 + p2) << 32);
+      // The high halves of p1 and p2 must be summed in 64 bits: each can be as large as
+      // 0xFFFFFFFE, so adding them as 32-bit values wraps and loses a carry into bit 32.
+      hi = p3 + (p1 >> 32) + (p2 >> 32) + carry;
+
+      R128_SET2(dst, lo, hi);
+#endif
+   }
+#endif
+}
+
+// 128/64->64
+// r128__udiv128(nlo, nhi, d, rem): divides the 128-bit value nhi:nlo by d, returns the
+// 64-bit quotient and stores the remainder in *rem. The portable branch asserts d != 0
+// and nhi < d (so the quotient fits in 64 bits).
+#if defined(_M_X64) && (_MSC_VER < 1920) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
+// MSVC x64 provides neither inline assembly nor (pre-2019) a div intrinsic, so we do fake
+// "inline assembly" to avoid long division or outline assembly.
+// The byte array below is machine code placed in the .text section and called through a
+// function pointer; the instruction comments give its disassembly.
+#pragma code_seg(".text")
+__declspec(allocate(".text") align(16)) static const unsigned char r128__udiv128Code[] = {
+   0x48, 0x8B, 0xC1,       //mov  rax, rcx
+   0x49, 0xF7, 0xF0,       //div  rax, r8
+   0x49, 0x89, 0x11,       //mov  qword ptr [r9], rdx
+   0xC3                    //ret
+};
+typedef R128_U64 (*r128__udiv128Proc)(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem);
+static const r128__udiv128Proc r128__udiv128 = (r128__udiv128Proc)(void*)r128__udiv128Code;
+#else
+static R128_U64 r128__udiv128(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem)
+{
+#if defined(_M_X64) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
+   return _udiv128(nhi, nlo, d, rem);
+#elif defined(__x86_64__) && !defined(R128_STDC_ONLY)
+   R128_U64 q, r;
+   __asm("divq %4"
+      : "=a"(q), "=d"(r)
+      : "a"(nlo), "d"(nhi), "X"(d));
+   *rem = r;
+   return q;
+#else
+   // Portable schoolbook long division in base 2^32: the 128-bit numerator is split into
+   // digits n3..n0, the divisor into d1:d0, and two quotient digits q1, q0 are produced.
+   R128_U64 tmp;
+   R128_U32 d0, d1;
+   R128_U32 n3, n2, n1, n0;
+   R128_U32 q0, q1;
+   R128_U32 r;
+   int shift;
+
+   R128_ASSERT(d != 0);    //division by zero
+   R128_ASSERT(nhi < d);   //overflow
+
+   // normalize
+   // (shift divisor and numerator left so the divisor's top bit is set)
+   shift = r128__clz64(d);
+
+   if (shift) {
+      R128 tmp128;
+      R128_SET2(&tmp128, nlo, nhi);
+      r128Shl(&tmp128, &tmp128, shift);
+      n3 = R128_R3(&tmp128);
+      n2 = R128_R2(&tmp128);
+      n1 = R128_R1(&tmp128);
+      n0 = R128_R0(&tmp128);
+      d <<= shift;
+   } else {
+      n3 = (R128_U32)(nhi >> 32);
+      n2 = (R128_U32)nhi;
+      n1 = (R128_U32)(nlo >> 32);
+      n0 = (R128_U32)nlo;
+   }
+
+   d1 = (R128_U32)(d >> 32);
+   d0 = (R128_U32)d;
+
+   // first digit
+   // estimate q1 from the top two numerator digits and the top divisor digit
+   R128_ASSERT(n3 <= d1);
+   if (n3 < d1) {
+      q1 = r128__udiv64(n2, n3, d1, &r);
+   } else {
+      q1 = 0xffffffffu;
+      r = n2 + d1;
+   }
+refine1:
+   // lower the estimate while q1*d0 exceeds the remainder extended by the next digit;
+   // stop once r + d1 would no longer fit in 32 bits
+   if (r128__umul64(q1, d0) > ((R128_U64)r << 32) + n1) {
+      --q1;
+      if (r < ~d1 + 1) {
+         r += d1;
+         goto refine1;
+      }
+   }
+
+   // subtract q1 * d from the running numerator
+   tmp = ((R128_U64)n2 << 32) + n1 - (r128__umul64(q1, d0) + (r128__umul64(q1, d1) << 32));
+   n2 = (R128_U32)(tmp >> 32);
+   n1 = (R128_U32)tmp;
+
+   // second digit
+   R128_ASSERT(n2 <= d1);
+   if (n2 < d1) {
+      q0 = r128__udiv64(n1, n2, d1, &r);
+   } else {
+      q0 = 0xffffffffu;
+      r = n1 + d1;
+   }
+refine0:
+   if (r128__umul64(q0, d0) > ((R128_U64)r << 32) + n0) {
+      --q0;
+      if (r < ~d1 + 1) {
+         r += d1;
+         goto refine0;
+      }
+   }
+
+   tmp = ((R128_U64)n1 << 32) + n0 - (r128__umul64(q0, d0) + (r128__umul64(q0, d1) << 32));
+   n1 = (R128_U32)(tmp >> 32);
+   n0 = (R128_U32)tmp;
+
+   // undo the normalization shift on the remainder
+   *rem = (((R128_U64)n1 << 32) + n0) >> shift;
+   return ((R128_U64)q1 << 32) + q0;
+#endif
+}
+#endif
+
+// Unsigned 128-bit comparison: returns 1 if a > b, -1 if a < b and 0 if they are equal.
+// The high halves decide first; the low halves only break a tie.
+static int r128__ucmp(const R128 *a, const R128 *b)
+{
+   if (a->hi > b->hi) {
+      return 1;
+   }
+   if (a->hi < b->hi) {
+      return -1;
+   }
+   if (a->lo > b->lo) {
+      return 1;
+   }
+   if (a->lo < b->lo) {
+      return -1;
+   }
+   return 0;
+}
+
+// Unsigned 64.64 fixed-point multiply. With a = a.hi:a.lo and b = b.hi:b.lo the four
+// 64x64 partial products are combined as
+//    (a.lo*b.lo >> 64) + a.lo*b.hi + a.hi*b.lo + (a.hi*b.hi << 64)
+// plus bit 63 of the low word of a.lo*b.lo as a rounding term. Bits that do not fit in
+// 128 bits are discarded.
+static void r128__umul(R128 *dst, const R128 *a, const R128 *b)
+{
+#if defined(_M_X64) && !defined(R128_STDC_ONLY)
+   R128_U64 t0, t1;
+   R128_U64 lo, hi = 0;
+   unsigned char carry;
+
+   // lo*lo: keep the high word, rounded by the top bit of the low word
+   t0 = _umul128(a->lo, b->lo, &t1);
+   carry = _addcarry_u64(0, t1, t0 >> 63, &lo);
+   _addcarry_u64(carry, hi, hi, &hi);
+
+   t0 = _umul128(a->lo, b->hi, &t1);
+   carry = _addcarry_u64(0, lo, t0, &lo);
+   _addcarry_u64(carry, hi, t1, &hi);
+
+   t0 = _umul128(a->hi, b->lo, &t1);
+   carry = _addcarry_u64(0, lo, t0, &lo);
+   _addcarry_u64(carry, hi, t1, &hi);
+
+   // hi*hi: only its low word lands inside the result
+   t0 = _umul128(a->hi, b->hi, &t1);
+   hi += t0;
+
+   R128_SET2(dst, lo, hi);
+#elif defined(__x86_64__) && !defined(R128_STDC_ONLY)
+   unsigned __int128 p0, p1, p2, p3;
+   p0 = a->lo * (unsigned __int128)b->lo;
+   p1 = a->lo * (unsigned __int128)b->hi;
+   p2 = a->hi * (unsigned __int128)b->lo;
+   p3 = a->hi * (unsigned __int128)b->hi;
+
+   p0 = (p3 << 64) + p2 + p1 + (p0 >> 64) + ((R128_U64)p0 >> 63);
+   dst->lo = (R128_U64)p0;
+   dst->hi = (R128_U64)(p0 >> 64);
+#else
+   R128 p0, p1, p2, p3, round;
+
+   r128__umul128(&p0, a->lo, b->lo);
+   round.hi = 0; round.lo = p0.lo >> 63;
+   p0.lo = p0.hi; p0.hi = 0; //r128Shr(&p0, &p0, 64);
+   r128Add(&p0, &p0, &round);
+
+   r128__umul128(&p1, a->hi, b->lo);
+   r128Add(&p0, &p0, &p1);
+
+   r128__umul128(&p2, a->lo, b->hi);
+   r128Add(&p0, &p0, &p2);
+
+   r128__umul128(&p3, a->hi, b->hi);
+   p3.hi = p3.lo; p3.lo = 0; //r128Shl(&p3, &p3, 64);
+   r128Add(&p0, &p0, &p3);
+
+   R128_SET2(dst, p0.lo, p0.hi);
+#endif
+}
+
+// Shift d left until the high bit is set, and shift n left by the same amount.
+// returns non-zero on overflow.
+// n and d are updated in place; the bits shifted out of the top of n are stored in *n2.
+// When d->hi is zero the operands are additionally moved up by one whole 64-bit word,
+// so d->lo ends up in d->hi and n becomes n2:n->hi with n->lo == 0.
+// On the overflow return, n, d and *n2 are left untouched.
+static int r128__norm(R128 *n, R128 *d, R128_U64 *n2)
+{
+   R128_U64 d0, d1;
+   R128_U64 n0, n1;
+   int shift;
+
+   d1 = d->hi;
+   d0 = d->lo;
+   n1 = n->hi;
+   n0 = n->lo;
+
+   if (d1) {
+      shift = r128__clz64(d1);
+      if (shift) {
+         d1 = (d1 << shift) | (d0 >> (64 - shift));
+         d0 = d0 << shift;
+         *n2 = n1 >> (64 - shift);
+         n1 = (n1 << shift) | (n0 >> (64 - shift));
+         n0 = n0 << shift;
+      } else {
+         // shift == 0 is handled separately: a shift by 64 - 0 bits is not valid
+         *n2 = 0;
+      }
+   } else {
+      shift = r128__clz64(d0);
+      // n->hi needs more leading zeros than d->lo, otherwise this is reported as overflow
+      if (r128__clz64(n1) <= shift) {
+         return 1; // overflow
+      }
+
+      if (shift) {
+         d1 = d0 << shift;
+         d0 = 0;
+         *n2 = (n1 << shift) | (n0 >> (64 - shift));
+         n1 = n0 << shift;
+         n0 = 0;
+      } else {
+         d1 = d0;
+         d0 = 0;
+         *n2 = n1;
+         n1 = n0;
+         n0 = 0;
+      }
+   }
+
+   R128_SET2(n, n0, n1);
+   R128_SET2(d, d0, d1);
+   return 0;
+}
+
+// Unsigned 128-bit division producing a 128-bit quotient. Operands are normalized with
+// r128__norm, then two 64-bit quotient digits (q.hi, q.lo) are computed by schoolbook
+// long division, each estimated with r128__udiv128 and corrected in a refine loop.
+// If r128__norm reports overflow the quotient is set to R128_max.
+// Asserts that the divisor is non-zero.
+static void r128__udiv(R128 *quotient, const R128 *dividend, const R128 *divisor)
+{
+   R128 tmp;
+   R128_U64 d0, d1;
+   R128_U64 n1, n2, n3;
+   R128 q;
+
+   R128_ASSERT(dividend != NULL);
+   R128_ASSERT(divisor != NULL);
+   R128_ASSERT(quotient != NULL);
+   R128_ASSERT(divisor->hi != 0 || divisor->lo != 0);  // divide by zero
+
+   // scale dividend and normalize
+   // (works on local copies so the caller's operands are not modified)
+   {
+      R128 n, d;
+      R128_SET2(&n, dividend->lo, dividend->hi);
+      R128_SET2(&d, divisor->lo, divisor->hi);
+      if (r128__norm(&n, &d, &n3)) {
+         R128_SET2(quotient, R128_max.lo, R128_max.hi);
+         return;
+      }
+
+      d1 = d.hi;
+      d0 = d.lo;
+      n2 = n.hi;
+      n1 = n.lo;
+   }
+
+   // first digit
+   R128_ASSERT(n3 <= d1);
+   {
+      R128 t0, t1;
+      t0.lo = n1;
+      if (n3 < d1) {
+         q.hi = r128__udiv128(n2, n3, d1, &t0.hi);
+      } else {
+         q.hi = R128_LIT_U64(0xffffffffffffffff);
+         t0.hi = n2 + d1;
+      }
+
+refine1:
+      // lower the estimate while q.hi * d0 exceeds remainder:n1; stop once adding d1 to
+      // the remainder would overflow 64 bits
+      r128__umul128(&t1, q.hi, d0);
+      if (r128__ucmp(&t1, &t0) > 0) {
+         --q.hi;
+         if (t0.hi < ~d1 + 1) {
+            t0.hi += d1;
+            goto refine1;
+         }
+      }
+   }
+
+   // subtract q.hi * d (modulo 2^128) from n2:n1 to get the numerator for the next digit
+   {
+      R128 t0, t1, t2;
+      t0.hi = n2;
+      t0.lo = n1;
+
+      r128__umul128(&t1, q.hi, d0);
+      r128__umul128(&t2, q.hi, d1);
+
+      t2.hi = t2.lo; t2.lo = 0;  //r128Shl(&t2, &t2, 64);
+      r128Add(&tmp, &t1, &t2);
+      r128Sub(&tmp, &t0, &tmp);
+   }
+   n2 = tmp.hi;
+   n1 = tmp.lo;
+
+   // second digit
+   R128_ASSERT(n2 <= d1);
+   {
+      R128 t0, t1;
+      t0.lo = 0;
+      if (n2 < d1) {
+         q.lo = r128__udiv128(n1, n2, d1, &t0.hi);
+      } else {
+         q.lo = R128_LIT_U64(0xffffffffffffffff);
+         t0.hi = n1 + d1;
+      }
+
+   refine0:
+      r128__umul128(&t1, q.lo, d0);
+      if (r128__ucmp(&t1, &t0) > 0) {
+         --q.lo;
+         if (t0.hi < ~d1 + 1) {
+            t0.hi += d1;
+            goto refine0;
+         }
+      }
+   }
+
+   R128_SET2(quotient, q.lo, q.hi);
+}
+
+// Helper for the modulo operation. Normalizes n and d IN PLACE with r128__norm and
+// returns a single 64-bit quotient digit q (not the remainder), estimated and refined
+// the same way as the first digit in r128__udiv.
+// Returns 0xffffffffffffffff when r128__norm reports overflow.
+// Asserts that d is non-zero.
+static R128_U64 r128__umod(R128 *n, R128 *d)
+{
+   R128_U64 d0, d1;
+   R128_U64 n3, n2, n1;
+   R128_U64 q;
+
+   R128_ASSERT(d != NULL);
+   R128_ASSERT(n != NULL);
+   R128_ASSERT(d->hi != 0 || d->lo != 0);  // divide by zero
+
+   if (r128__norm(n, d, &n3)) {
+      return R128_LIT_U64(0xffffffffffffffff);
+   }
+
+   d1 = d->hi;
+   d0 = d->lo;
+   n2 = n->hi;
+   n1 = n->lo;
+
+   R128_ASSERT(n3 < d1);
+   {
+      R128 t0, t1;
+      t0.lo = n1;
+      q = r128__udiv128(n2, n3, d1, &t0.hi);
+
+   refine1:
+      // lower q while q * d0 exceeds remainder:n1
+      r128__umul128(&t1, q, d0);
+      if (r128__ucmp(&t1, &t0) > 0) {
+         --q;
+         if (t0.hi < ~d1 + 1) {
+            t0.hi += d1;
+            goto refine1;
+         }
+      }
+   }
+
+   return q;
+}
+
+// Formats *v as decimal text into dst according to *format.
+//
+//   dst, dstSize  output buffer; at most dstSize - 1 characters are stored and the
+//                 result is always NUL-terminated (dstSize must be > 0).
+//   format        sign style, minimum width, precision (negative means "as many
+//                 fractional digits as needed, at most 20"), zero padding, forced
+//                 decimal separator and left alignment.
+//
+// Returns the number of characters the full output requires (excluding the NUL), which
+// can exceed what was stored when the buffer is too small.
+//
+// Digits are built in `buf` with the fractional digits first (in reading order), then
+// the decimal separator, then the whole-part digits least-significant first; the output
+// stage reverses the whole part.
+//
+// Padding: the sign character is decided before the pad count is computed, so every
+// alignment mode pads to exactly `width`. Previously, right-aligned space-padded output
+// without a sign character came out one column short.
+static int r128__format(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *format)
+{
+   char buf[128];
+   R128 tmp;
+   R128_U64 whole;
+   char *cursor, *decimal, *dstp = dst;
+   char signChar = 0;   // 0 means no sign character is written
+   int sign = 0;
+   int fullPrecision = 1;
+   int width, precision;
+   int padCnt, trail = 0;
+
+   R128_ASSERT(dst != NULL && dstSize > 0);
+   R128_ASSERT(v != NULL);
+   R128_ASSERT(format != NULL);
+
+   // reserve room for the terminating NUL
+   --dstSize;
+
+   // work on the magnitude and remember the sign
+   R128_SET2(&tmp, v->lo, v->hi);
+   if (r128IsNeg(&tmp)) {
+      r128__neg(&tmp, &tmp);
+      sign = 1;
+   }
+
+   width = format->width;
+   if (width < 0) {
+      width = 0;
+   }
+
+   precision = format->precision;
+   if (precision < 0) {
+      // print a maximum of 20 digits
+      fullPrecision = 0;
+      precision = 20;
+   } else if (precision > (int)sizeof(buf) - 21) {
+      // buf keeps 21 bytes for the whole part and separator; fractional digits beyond
+      // the rest are emitted as trailing zeroes at the end
+      trail = precision - ((int)sizeof(buf) - 21);
+      precision -= trail;
+   }
+
+   whole = tmp.hi;
+   decimal = cursor = buf;
+
+   // fractional part first in case a carry into the whole part is required
+   if (tmp.lo || format->decimal) {
+      while (tmp.lo || (fullPrecision && precision)) {
+         if ((int)(cursor - buf) == precision) {
+            // the remaining fraction is >= 1/2 when its top bit is set
+            if ((R128_S64)tmp.lo < 0) {
+               // round up, propagate carry backwards
+               char *c;
+               for (c = cursor - 1; c >= buf; --c) {
+                  char d = ++*c;
+                  if (d <= '9') {
+                     goto endfrac;
+                  } else {
+                     *c = '0';
+                  }
+               }
+
+               // carry out into the whole part
+               whole++;
+            }
+
+            break;
+         }
+
+         // multiplying the fraction by 10 moves the next decimal digit into tmp.hi
+         r128__umul128(&tmp, tmp.lo, 10);
+         *cursor++ = (char)tmp.hi + '0';
+      }
+
+   endfrac:
+      if (format->decimal || precision) {
+         decimal = cursor;
+         *cursor++ = R128_decimal;
+      }
+   }
+
+   // whole part
+   do {
+      char digit = (char)(whole % 10);
+      whole /= 10;
+      *cursor++ = digit + '0';
+   } while (whole);
+
+// stores c only while there is room, but always advances dstp so the required length
+// can be reported
+#define R128__WRITE(c) do { if (dstp < dst + dstSize) *dstp = c; ++dstp; } while(0)
+
+   // decide the sign character up front so the pad count is right in every mode
+   if (sign) {
+      signChar = '-';
+   } else if (format->sign == R128ToStringSign_Plus) {
+      signChar = '+';
+   } else if (format->sign == R128ToStringSign_Space) {
+      signChar = ' ';
+   }
+
+   padCnt = width - (int)(cursor - buf) - (signChar ? 1 : 0);
+
+   if (!format->leftAlign) {
+      if (format->zeroPad) {
+         // sign goes before the zero padding
+         if (signChar) {
+            R128__WRITE(signChar);
+         }
+         for (; padCnt > 0; --padCnt) {
+            R128__WRITE('0');
+         }
+      } else {
+         // space padding goes before the sign
+         for (; padCnt > 0; --padCnt) {
+            R128__WRITE(' ');
+         }
+         if (signChar) {
+            R128__WRITE(signChar);
+         }
+      }
+   } else if (signChar) {
+      R128__WRITE(signChar);
+   }
+
+   {
+      char *i;
+
+      // reverse the whole part
+      for (i = cursor - 1; i >= decimal; --i) {
+         R128__WRITE(*i);
+      }
+
+      // copy the fractional part
+      for (i = buf; i < decimal; ++i) {
+         R128__WRITE(*i);
+      }
+   }
+
+   // right padding
+   if (format->leftAlign) {
+      char padChar = format->zeroPad ? '0' : ' ';
+      for (; padCnt > 0; --padCnt) {
+         R128__WRITE(padChar);
+      }
+   }
+
+   // trailing zeroes for very large precision
+   while (trail--) {
+      R128__WRITE('0');
+   }
+
+#undef R128__WRITE
+
+   if (dstp <= dst + dstSize) {
+      *dstp = '\0';
+   } else {
+      dst[dstSize] = '\0';
+   }
+   return (int)(dstp - dst);
+}
+
+// Sets *dst to the integer v: v becomes the high word and the low word is zeroed.
+void r128FromInt(R128 *dst, R128_S64 v)
+{
+   R128_ASSERT(dst != NULL);
+   dst->lo = 0;
+   dst->hi = (R128_U64)v;
+   R128_DEBUG_SET(dst);
+}
+
+// Converts a double to 64.64 fixed point.
+//   - NaN is converted to zero. It fails both range checks below, and converting NaN to
+//     an integer type is undefined behaviour, so it needs its own branch.
+//   - Values below -2^63 saturate to R128_min; values at or above 2^63 saturate to R128_max.
+//   - Otherwise the magnitude is split into an integer part (hi) and a fraction scaled
+//     by 2^64 (lo), and the result is negated if v was negative.
+void r128FromFloat(R128 *dst, double v)
+{
+   R128_ASSERT(dst != NULL);
+
+   if (v != v) {
+      // only NaN compares unequal to itself
+      r128Copy(dst, &R128_zero);
+   } else if (v < -9223372036854775808.0) {
+      r128Copy(dst, &R128_min);
+   } else if (v >= 9223372036854775808.0) {
+      r128Copy(dst, &R128_max);
+   } else {
+      R128 r;
+      int sign = 0;
+
+      if (v < 0) {
+         v = -v;
+         sign = 1;
+      }
+
+      r.hi = (R128_U64)(R128_S64)v;
+      v -= (R128_S64)v;
+      // v is now the fractional part; scale it by 2^64
+      r.lo = (R128_U64)(v * 18446744073709551616.0);
+
+      if (sign) {
+         r128__neg(&r, &r);
+      }
+
+      r128Copy(dst, &r);
+   }
+}
+
+// Parses a fixed-point number from the NUL-terminated string s into *dst.
+//
+// Accepted syntax: optional leading whitespace (space, \t, \r, \n, \v), an optional
+// '+' or '-', an optional "0x"/"0X" prefix selecting base 16 (base 10 otherwise), whole
+// digits, and optionally the R128_decimal character followed by fractional digits.
+// Overflow of the whole part is not detected.
+//
+// If endptr is non-NULL, *endptr receives a pointer to the first character after the
+// parsed number. Previously the backward pass over the fractional digits moved `s`
+// itself, so *endptr ended up on the decimal separator.
+void r128FromString(R128 *dst, const char *s, char **endptr)
+{
+   R128_U64 lo = 0, hi = 0;
+   R128_U64 base = 10;
+
+   int sign = 0;
+
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(s != NULL);
+
+   R128_SET2(dst, 0, 0);
+
+   // consume whitespace
+   while (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n' || *s == '\v') {
+      ++s;
+   }
+
+   // sign
+   if (*s == '-') {
+      sign = 1;
+      ++s;
+   } else if (*s == '+') {
+      ++s;
+   }
+
+   // parse base prefix
+   if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
+      base = 16;
+      s += 2;
+   }
+
+   // whole part
+   for (;; ++s) {
+      R128_U64 digit;
+
+      if ('0' <= *s && *s <= '9') {
+         digit = *s - '0';
+      } else if (base == 16 && 'a' <= *s && *s <= 'f') {
+         digit = *s - 'a' + 10;
+      } else if (base == 16 && 'A' <= *s && *s <= 'F') {
+         digit = *s - 'A' + 10;
+      } else {
+         break;
+      }
+
+      hi = hi * base + digit;
+   }
+
+   // fractional part
+   if (*s == R128_decimal) {
+      const char *frac = ++s;   // first fractional digit
+      const char *c;
+
+      // advance s past the last fractional digit; it stays there for endptr
+      for (;; ++s) {
+         if ('0' <= *s && *s <= '9') {
+         } else if (base == 16 && ('a' <= *s && *s <= 'f')) {
+         } else if (base == 16 && ('A' <= *s && *s <= 'F')) {
+         } else {
+            break;
+         }
+      }
+
+      // Work backwards from the last digit with a separate cursor: each step computes
+      // lo = (digit + lo / 2^64) / base as a 64-bit fraction.
+      for (c = s - 1; c >= frac; --c) {
+         R128_U64 digit, unused;
+
+         if ('0' <= *c && *c <= '9') {
+            digit = *c - '0';
+         } else if ('a' <= *c && *c <= 'f') {
+            digit = *c - 'a' + 10;
+         } else {
+            digit = *c - 'A' + 10;
+         }
+
+         lo = r128__udiv128(lo, digit, base, &unused);
+      }
+   }
+
+   R128_SET2(dst, lo, hi);
+   if (sign) {
+      r128__neg(dst, dst);
+   }
+
+   if (endptr) {
+      *endptr = (char *) s;
+   }
+}
+
+// Returns the high word of *v reinterpreted as signed; the low (fractional) word is
+// ignored, so no rounding is applied.
+R128_S64 r128ToInt(const R128 *v)
+{
+   R128_ASSERT(v != NULL);
+   return (R128_S64)v->hi;
+}
+
+// Converts *v to double: takes the magnitude, adds the integer word and the fraction
+// word scaled by 2^-64, then restores the sign.
+double r128ToFloat(const R128 *v)
+{
+   R128 magnitude;
+   int negative = 0;
+   double result;
+
+   R128_ASSERT(v != NULL);
+
+   R128_SET2(&magnitude, v->lo, v->hi);
+   if (r128IsNeg(&magnitude)) {
+      negative = 1;
+      r128__neg(&magnitude, &magnitude);
+   }
+
+   result = magnitude.hi + magnitude.lo * (1 / 18446744073709551616.0);
+   return negative ? -result : result;
+}
+
+// Formats *v into dst using the caller-supplied options; forwards directly to
+// r128__format and returns its result.
+int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt)
+{
+   return r128__format(dst, dstSize, v, opt);
+}
+
+// Formats *v into dst using a printf-style specification: an optional leading '%',
+// flags (' ', '+', '0', '-', '#'), a decimal width and an optional ".precision".
+// Anything after those fields is ignored. Returns the result of r128__format.
+int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v)
+{
+   R128ToStringFormat opts;
+   int inFlags = 1;
+
+   R128_ASSERT(dst != NULL && dstSize);
+   R128_ASSERT(format != NULL);
+   R128_ASSERT(v != NULL);
+
+   // start from the defaults; width is assigned further down
+   opts.sign = R128__defaultFormat.sign;
+   opts.precision = R128__defaultFormat.precision;
+   opts.zeroPad = R128__defaultFormat.zeroPad;
+   opts.decimal = R128__defaultFormat.decimal;
+   opts.leftAlign = R128__defaultFormat.leftAlign;
+
+   if (*format == '%') {
+      ++format;
+   }
+
+   // flags field
+   while (inFlags) {
+      switch (*format) {
+      case ' ':
+         // a space after '+' is not accepted as a flag and ends the flags field
+         if (opts.sign != R128ToStringSign_Plus) {
+            opts.sign = R128ToStringSign_Space;
+         } else {
+            inFlags = 0;
+         }
+         break;
+      case '+':
+         opts.sign = R128ToStringSign_Plus;
+         break;
+      case '0':
+         opts.zeroPad = 1;
+         break;
+      case '-':
+         opts.leftAlign = 1;
+         break;
+      case '#':
+         opts.decimal = 1;
+         break;
+      default:
+         inFlags = 0;
+         break;
+      }
+
+      if (inFlags) {
+         ++format;
+      }
+   }
+
+   // width field
+   opts.width = 0;
+   while ('0' <= *format && *format <= '9') {
+      opts.width = opts.width * 10 + *format++ - '0';
+   }
+
+   // precision field
+   if (*format == '.') {
+      opts.precision = 0;
+      ++format;
+      while ('0' <= *format && *format <= '9') {
+         opts.precision = opts.precision * 10 + *format++ - '0';
+      }
+   }
+
+   return r128__format(dst, dstSize, v, &opts);
+}
+
+// Formats *v into dst with R128__defaultFormat; returns the result of r128__format.
+int r128ToString(char *dst, size_t dstSize, const R128 *v)
+{
+   return r128__format(dst, dstSize, v, &R128__defaultFormat);
+}
+
+// Copies both words of *src into *dst.
+void r128Copy(R128 *dst, const R128 *src)
+{
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(src != NULL);
+   dst->lo = src->lo;
+   dst->hi = src->hi;
+   R128_DEBUG_SET(dst);
+}
+
+// Public negation: dst = -src, implemented by r128__neg (which asserts the pointers).
+void r128Neg(R128 *dst, const R128 *src)
+{
+   r128__neg(dst, src);
+   R128_DEBUG_SET(dst);
+}
+
+// Bitwise complement: dst = ~src, applied to each 64-bit word.
+void r128Not(R128 *dst, const R128 *src)
+{
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(src != NULL);
+
+   dst->lo = ~src->lo;
+   dst->hi = ~src->hi;
+   R128_DEBUG_SET(dst);
+}
+
+// Bitwise OR: dst = a | b, applied to each 64-bit word.
+void r128Or(R128 *dst, const R128 *a, const R128 *b)
+{
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+
+   dst->lo = a->lo | b->lo;
+   dst->hi = a->hi | b->hi;
+   R128_DEBUG_SET(dst);
+}
+
+// Bitwise AND: dst = a & b, applied to each 64-bit word.
+void r128And(R128 *dst, const R128 *a, const R128 *b)
+{
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+
+   dst->lo = a->lo & b->lo;
+   dst->hi = a->hi & b->hi;
+   R128_DEBUG_SET(dst);
+}
+
+// Bitwise XOR: dst = a ^ b, applied to each 64-bit word.
+void r128Xor(R128 *dst, const R128 *a, const R128 *b)
+{
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+
+   dst->lo = a->lo ^ b->lo;
+   dst->hi = a->hi ^ b->hi;
+   R128_DEBUG_SET(dst);
+}
+
+// Logical left shift: dst = src << amount. The portable branch masks amount to 0..127.
+// The result is assembled in the scratch array r and read back from r[0] (lo), r[1] (hi).
+void r128Shl(R128 *dst, const R128 *src, int amount)
+{
+   R128_U64 r[4];
+
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(src != NULL);
+
+#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
+   // MSVC x86: shift the four 32-bit words, then store them into r at an offset of
+   // (amount / 32) & 3 words so that whole-word shifts become a displaced store.
+   __asm {
+      // load src
+      mov edx, dword ptr[src]
+      mov ecx, amount
+
+      mov edi, dword ptr[edx]
+      mov esi, dword ptr[edx + 4]
+      mov ebx, dword ptr[edx + 8]
+      mov eax, dword ptr[edx + 12]
+
+      // shift mod 32
+      shld eax, ebx, cl
+      shld ebx, esi, cl
+      shld esi, edi, cl
+      shl edi, cl
+
+      // clear out low 12 bytes of stack
+      xor edx, edx
+      mov dword ptr[r], edx
+      mov dword ptr[r + 4], edx
+      mov dword ptr[r + 8], edx
+
+      // store shifted amount offset by count/32 bits
+      shr ecx, 5
+      and ecx, 3
+      mov dword ptr[r + ecx * 4 + 0], edi
+      mov dword ptr[r + ecx * 4 + 4], esi
+      mov dword ptr[r + ecx * 4 + 8], ebx
+      mov dword ptr[r + ecx * 4 + 12], eax
+   }
+#else
+
+   r[0] = src->lo;
+   r[1] = src->hi;
+
+   amount &= 127;
+   if (amount >= 64) {
+      // the whole low word moves into the high word
+      r[1] = r[0] << (amount - 64);
+      r[0] = 0;
+   } else if (amount) {
+      // amount == 0 is skipped: the 64 - amount shift below would be a shift by 64
+#  ifdef _M_X64
+      r[1] = __shiftleft128(r[0], r[1], (char) amount);
+#  else
+      r[1] = (r[1] << amount) | (r[0] >> (64 - amount));
+#  endif
+      r[0] = r[0] << amount;
+   }
+#endif   //_M_IX86
+
+   dst->lo = r[0];
+   dst->hi = r[1];
+   R128_DEBUG_SET(dst);
+}
+
+// Logical right shift: dst = src >> amount with zeroes shifted in from the top.
+// The portable branch masks amount to 0..127.
+// The result is assembled in the scratch array r and read back from r[2] (lo), r[3] (hi).
+void r128Shr(R128 *dst, const R128 *src, int amount)
+{
+   R128_U64 r[4];
+
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(src != NULL);
+
+#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
+   // MSVC x86: shift the four 32-bit words, then store them into r displaced downward
+   // by (amount / 32) & 3 words; the result is read from the upper half of r.
+   __asm {
+      // load src
+      mov edx, dword ptr[src]
+      mov ecx, amount
+
+      mov edi, dword ptr[edx]
+      mov esi, dword ptr[edx + 4]
+      mov ebx, dword ptr[edx + 8]
+      mov eax, dword ptr[edx + 12]
+
+      // shift mod 32
+      shrd edi, esi, cl
+      shrd esi, ebx, cl
+      shrd ebx, eax, cl
+      shr eax, cl
+
+      // clear out high 12 bytes of stack
+      xor edx, edx
+      mov dword ptr[r + 20], edx
+      mov dword ptr[r + 24], edx
+      mov dword ptr[r + 28], edx
+
+      // store shifted amount offset by -count/32 bits
+      shr ecx, 5
+      and ecx, 3
+      neg ecx
+      mov dword ptr[r + ecx * 4 + 16], edi
+      mov dword ptr[r + ecx * 4 + 20], esi
+      mov dword ptr[r + ecx * 4 + 24], ebx
+      mov dword ptr[r + ecx * 4 + 28], eax
+   }
+#else
+   r[2] = src->lo;
+   r[3] = src->hi;
+
+   amount &= 127;
+   if (amount >= 64) {
+      // the whole high word moves into the low word
+      r[2] = r[3] >> (amount - 64);
+      r[3] = 0;
+   } else if (amount) {
+      // amount == 0 is skipped: the 64 - amount shift below would be a shift by 64
+#ifdef _M_X64
+      r[2] = __shiftright128(r[2], r[3], (char) amount);
+#else
+      r[2] = (r[2] >> amount) | (r[3] << (64 - amount));
+#endif
+      r[3] = r[3] >> amount;
+   }
+#endif
+
+   dst->lo = r[2];
+   dst->hi = r[3];
+   R128_DEBUG_SET(dst);
+}
+
+// Arithmetic right shift: dst = src >> amount with the sign bit of src->hi replicated
+// into the vacated high bits. The portable branch masks amount to 0..127.
+// The result is assembled in the scratch array r and read back from r[2] (lo), r[3] (hi).
+void r128Sar(R128 *dst, const R128 *src, int amount)
+{
+   R128_U64 r[4];
+
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(src != NULL);
+
+#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
+   // MSVC x86: shift the four 32-bit words, then store them into r displaced downward
+   // by (amount / 32) & 3 words; the upper words are pre-filled with the sign.
+   __asm {
+      // load src
+      mov edx, dword ptr[src]
+      mov ecx, amount
+
+      mov edi, dword ptr[edx]
+      mov esi, dword ptr[edx + 4]
+      mov ebx, dword ptr[edx + 8]
+      mov eax, dword ptr[edx + 12]
+
+      // shift mod 32
+      shrd edi, esi, cl
+      shrd esi, ebx, cl
+      shrd ebx, eax, cl
+      sar eax, cl
+
+      // copy sign to high 12 bytes of stack
+      cdq
+      mov dword ptr[r + 20], edx
+      mov dword ptr[r + 24], edx
+      mov dword ptr[r + 28], edx
+
+      // store shifted amount offset by -count/32 bits
+      shr ecx, 5
+      and ecx, 3
+      neg ecx
+      mov dword ptr[r + ecx * 4 + 16], edi
+      mov dword ptr[r + ecx * 4 + 20], esi
+      mov dword ptr[r + ecx * 4 + 24], ebx
+      mov dword ptr[r + ecx * 4 + 28], eax
+   }
+#else
+   r[2] = src->lo;
+   r[3] = src->hi;
+
+   amount &= 127;
+   if (amount >= 64) {
+      // low word receives the sign-extended high word; high word becomes all sign bits
+      r[2] = (R128_U64)((R128_S64)r[3] >> (amount - 64));
+      r[3] = (R128_U64)((R128_S64)r[3] >> 63);
+   } else if (amount) {
+      // The bits moving from hi into lo are shifted left as an unsigned value:
+      // left-shifting a negative signed integer is undefined behaviour, and the
+      // resulting bit pattern is the same.
+      r[2] = (r[2] >> amount) | (r[3] << (64 - amount));
+      r[3] = (R128_U64)((R128_S64)r[3] >> amount);
+   }
+#endif
+
+   dst->lo = r[2];
+   dst->hi = r[3];
+   R128_DEBUG_SET(dst);
+}
+
+// 128-bit addition: *dst = *a + *b. The carry out of the top word is discarded.
+void r128Add(R128 *dst, const R128 *a, const R128 *b)
+{
+   unsigned char carry = 0;
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+#if R128_INTEL && !defined(R128_STDC_ONLY)
+#  if R128_64BIT
+   carry = _addcarry_u64(carry, a->lo, b->lo, &dst->lo);
+   carry = _addcarry_u64(carry, a->hi, b->hi, &dst->hi);
+#  else
+   R128_U32 w0, w1, w2, w3;   // partial sums, one per 32-bit word
+   carry = _addcarry_u32(carry, R128_R0(a), R128_R0(b), &w0);
+   carry = _addcarry_u32(carry, R128_R1(a), R128_R1(b), &w1);
+   carry = _addcarry_u32(carry, R128_R2(a), R128_R2(b), &w2);
+   carry = _addcarry_u32(carry, R128_R3(a), R128_R3(b), &w3);
+   R128_SET4(dst, w0, w1, w2, w3);
+#  endif //R128_64BIT
+#else
+   {
+      // Portable path: an unsigned sum smaller than an operand means it wrapped.
+      const R128_U64 lowSum = a->lo + b->lo;
+      carry = (unsigned char)(lowSum < a->lo);
+      dst->hi = a->hi + b->hi + carry;
+      dst->lo = lowSum;
+   }
+#endif   //R128_INTEL
+   R128_DEBUG_SET(dst);
+}
+
+// 128-bit subtraction: *dst = *a - *b. The borrow out of the top word is discarded.
+void r128Sub(R128 *dst, const R128 *a, const R128 *b)
+{
+   unsigned char borrow = 0;
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+#if R128_INTEL && !defined(R128_STDC_ONLY)
+#  if R128_64BIT
+   borrow = _subborrow_u64(borrow, a->lo, b->lo, &dst->lo);
+   borrow = _subborrow_u64(borrow, a->hi, b->hi, &dst->hi);
+#  else
+   R128_U32 w0, w1, w2, w3;   // partial differences, one per 32-bit word
+   borrow = _subborrow_u32(borrow, R128_R0(a), R128_R0(b), &w0);
+   borrow = _subborrow_u32(borrow, R128_R1(a), R128_R1(b), &w1);
+   borrow = _subborrow_u32(borrow, R128_R2(a), R128_R2(b), &w2);
+   borrow = _subborrow_u32(borrow, R128_R3(a), R128_R3(b), &w3);
+   R128_SET4(dst, w0, w1, w2, w3);
+#  endif //R128_64BIT
+#else
+   {
+      // Portable path: an unsigned difference larger than the minuend means it wrapped.
+      const R128_U64 lowDiff = a->lo - b->lo;
+      borrow = (unsigned char)(lowDiff > a->lo);
+      dst->hi = a->hi - b->hi - borrow;
+      dst->lo = lowDiff;
+   }
+#endif   //R128_INTEL
+   R128_DEBUG_SET(dst);
+}
+
+// Signed multiply: *dst = *a * *b. Operands that r128IsNeg reports as negative are
+// negated first, r128__umul multiplies them, and the sign is re-applied afterwards.
+void r128Mul(R128 *dst, const R128 *a, const R128 *b)
+{
+   R128 lhs, rhs, product;
+   int negateResult = 0;
+
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+   // Work on local copies so the inputs are never modified.
+   R128_SET2(&lhs, a->lo, a->hi);
+   R128_SET2(&rhs, b->lo, b->hi);
+   if (r128IsNeg(&lhs)) {
+      r128__neg(&lhs, &lhs);
+      negateResult ^= 1;
+   }
+   if (r128IsNeg(&rhs)) {
+      r128__neg(&rhs, &rhs);
+      negateResult ^= 1;
+   }
+
+   r128__umul(&product, &lhs, &rhs);
+   if (negateResult) {
+      r128__neg(&product, &product);
+   }
+   r128Copy(dst, &product);
+}
+
+// Signed divide: *dst = *a / *b.
+// Division by zero does not trap: it stores R128_min for a negative dividend
+// and R128_max otherwise.
+void r128Div(R128 *dst, const R128 *a, const R128 *b)
+{
+   R128 numer, denom, quot;
+   int negateResult = 0;
+
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+
+   R128_SET2(&numer, a->lo, a->hi);
+   R128_SET2(&denom, b->lo, b->hi);
+
+   if (r128IsNeg(&numer)) {
+      r128__neg(&numer, &numer);
+      negateResult = 1;
+   }
+
+   // divide by zero: the result depends only on the sign of the dividend
+   if ((denom.lo | denom.hi) == 0) {
+      r128Copy(dst, negateResult ? &R128_min : &R128_max);
+      return;
+   }
+
+   if (r128IsNeg(&denom)) {
+      r128__neg(&denom, &denom);
+      negateResult = !negateResult;
+   }
+
+   // Divide the non-negative operands, then restore the sign of the quotient.
+   r128__udiv(&quot, &numer, &denom);
+   if (negateResult) {
+      r128__neg(&quot, &quot);
+   }
+   r128Copy(dst, &quot);
+}
+
+// Signed remainder: *dst = *a - q * *b, where q is the integer returned by r128__umod
+// for the sign-stripped operands (presumably the truncated quotient), signed like a / b.
+// A zero divisor stores R128_min for a negative dividend and R128_max otherwise.
+void r128Mod(R128 *dst, const R128 *a, const R128 *b)
+{
+   R128 numer, denom, quot;
+   int negateQuot = 0;
+
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+
+   R128_SET2(&numer, a->lo, a->hi);
+   R128_SET2(&denom, b->lo, b->hi);
+
+   if (r128IsNeg(&numer)) {
+      r128__neg(&numer, &numer);
+      negateQuot = 1;
+   }
+
+   // divide by zero
+   if ((denom.lo | denom.hi) == 0) {
+      r128Copy(dst, negateQuot ? &R128_min : &R128_max);
+      return;
+   }
+
+   if (r128IsNeg(&denom)) {
+      r128__neg(&denom, &denom);
+      negateQuot = !negateQuot;
+   }
+
+   // Build the quotient as a whole number: integer part in hi, zero in lo.
+   quot.lo = 0;
+   quot.hi = r128__umod(&numer, &denom);
+   if (negateQuot) {
+      quot.hi = (R128_U64)0 - quot.hi;   // lo is zero, so negating hi negates the whole value
+   }
+   r128Mul(&quot, &quot, b);
+   r128Sub(dst, a, &quot);
+}
+
+// Reciprocal square root: *dst = 1 / sqrt(*v), refined by up to 7 Newton-Raphson steps.
+// A negative input stores R128_min; a zero input stores 0.
+void r128Rsqrt(R128 *dst, const R128 *v)
+{
+   static const R128 threeHalves = { R128_LIT_U64(0x8000000000000000), 1 };
+   R128 x, est;
+   int i;
+
+   R128_ASSERT(dst != NULL);   // same NULL checks as the other entry points
+   R128_ASSERT(v != NULL);
+   if ((R128_S64)v->hi < 0) {
+      r128Copy(dst, &R128_min);
+      return;
+   }
+   R128_SET2(&x, v->lo, v->hi);
+
+   // get initial estimate
+   if (x.hi) {
+      int shift = (64 + r128__clz64(x.hi)) >> 1;
+      est.lo = R128_LIT_U64(1) << shift;
+      est.hi = 0;
+   } else if (x.lo) {
+      int shift = r128__clz64(x.lo) >> 1;
+      est.hi = R128_LIT_U64(1) << shift;
+      est.lo = 0;
+   } else {
+      R128_SET2(dst, 0, 0);
+      return;
+   }
+
+   // x /= 2
+   r128Shr(&x, &x, 1);
+
+   // Newton-Raphson iterate
+   for (i = 0; i < 7; ++i) {
+      R128 newEst;
+      // newEst = est * (threeHalves - x * est * est), where x already holds v / 2
+      r128__umul(&newEst, &est, &est);
+      r128__umul(&newEst, &newEst, &x);
+      r128Sub(&newEst, &threeHalves, &newEst);
+      r128__umul(&newEst, &est, &newEst);
+      if (newEst.lo == est.lo && newEst.hi == est.hi) {
+         break;
+      }
+      R128_SET2(&est, newEst.lo, newEst.hi);
+   }
+   r128Copy(dst, &est);
+}
+
+// Square root: *dst = sqrt(*v), refined by up to 7 Newton-Raphson steps.
+// A negative input stores R128_min; a zero input stores 0.
+void r128Sqrt(R128 *dst, const R128 *v)
+{
+   R128 x, est;
+   int i;
+
+   R128_ASSERT(dst != NULL);   // same NULL checks as the other entry points
+   R128_ASSERT(v != NULL);
+   if ((R128_S64)v->hi < 0) {
+      r128Copy(dst, &R128_min);
+      return;
+   }
+   R128_SET2(&x, v->lo, v->hi);
+
+   // get initial estimate
+   if (x.hi) {
+      int shift = (63 - r128__clz64(x.hi)) >> 1;
+      r128Shr(&est, &x, shift);
+   } else if (x.lo) {
+      int shift = (1 + r128__clz64(x.lo)) >> 1;
+      r128Shl(&est, &x, shift);
+   } else {
+      R128_SET2(dst, 0, 0);
+      return;
+   }
+
+   // Newton-Raphson iterate
+   for (i = 0; i < 7; ++i) {
+      R128 newEst;
+      // newEst = (est + x / est) / 2
+      r128__udiv(&newEst, &x, &est);
+      r128Add(&newEst, &newEst, &est);
+      r128Shr(&newEst, &newEst, 1);
+      if (newEst.lo == est.lo && newEst.hi == est.hi) {
+         break;
+      }
+      R128_SET2(&est, newEst.lo, newEst.hi);
+   }
+   r128Copy(dst, &est);
+}
+
+// Three-way comparison of two R128 values.
+// Returns 0 when *a == *b, 1 when *a > *b and -1 when *a < *b.
+int r128Cmp(const R128 *a, const R128 *b)
+{
+   R128_S64 hiA, hiB;
+
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+
+   // The high words are ordered as signed integers.
+   hiA = (R128_S64)a->hi;
+   hiB = (R128_S64)b->hi;
+   if (hiA != hiB) {
+      return (hiA > hiB) ? 1 : -1;
+   }
+
+   // Equal high words: the low words decide, compared as unsigned.
+   return (a->lo > b->lo) - (a->lo < b->lo);
+}
+
+// Returns 1 when the top (sign) bit of v->hi is set, i.e. *v is negative, else 0.
+int r128IsNeg(const R128 *v)
+{
+   R128_ASSERT(v != NULL);
+   return (v->hi >> 63) != 0;
+}
+
+// Stores the smaller of *a and *b in *dst; when they compare equal, *b is copied.
+void r128Min(R128 *dst, const R128 *a, const R128 *b)
+{
+   const R128 *smaller;
+
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+
+   smaller = (r128Cmp(a, b) < 0) ? a : b;
+   r128Copy(dst, smaller);
+}
+
+// Stores the larger of *a and *b in *dst; when they compare equal, *b is copied.
+void r128Max(R128 *dst, const R128 *a, const R128 *b)
+{
+   const R128 *larger;
+
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(a != NULL);
+   R128_ASSERT(b != NULL);
+
+   larger = (r128Cmp(a, b) > 0) ? a : b;
+   r128Copy(dst, larger);
+}
+
+// Rounds *v toward negative infinity and stores the result in *dst.
+// R128 is two's complement (see r128Cmp/r128Add): value = (signed)hi + lo / 2^64,
+// with the fraction lo / 2^64 always in [0, 1) regardless of sign. Clearing lo
+// therefore already yields the floor; the previous code also subtracted 1 from
+// negative non-integers, so floor(-0.5) came out as -2 instead of -1.
+void r128Floor(R128 *dst, const R128 *v)
+{
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(v != NULL);
+   dst->hi = v->hi;
+   dst->lo = 0;
+   R128_DEBUG_SET(dst);
+}
+
+// Rounds *v toward positive infinity and stores the result in *dst.
+// R128 is two's complement (see r128Cmp/r128Add): value = (signed)hi + lo / 2^64,
+// with the fraction lo / 2^64 always in [0, 1) regardless of sign, so any non-zero
+// lo must bump hi by one. The previous code did this only for hi > 0, which gave
+// ceil(0.5) == 0 and ceil(-0.5) == -1.
+void r128Ceil(R128 *dst, const R128 *v)
+{
+   R128_ASSERT(dst != NULL);
+   R128_ASSERT(v != NULL);
+   dst->hi = v->hi + (v->lo != 0);   // read lo before it is cleared (dst may alias v)
+   dst->lo = 0;
+   R128_DEBUG_SET(dst);
+}
+
+#endif   //R128_IMPLEMENTATION
diff --git a/mesh_utils.cpp b/mesh_utils.cpp
index e3f5ca5..ccbeea3 100644
--- a/mesh_utils.cpp
+++ b/mesh_utils.cpp
@@ -31,6 +31,10 @@ SOFTWARE.
 
 #if GODOT4
 #define Texture Texture2D
+
+#include "core/math/delaunay_3d.h"
+#else
+#include "delaunay/delaunay_3d.h"
 #endif
 
 MeshUtils *MeshUtils::_instance;
@@ -592,6 +596,29 @@ PoolVector2Array MeshUtils::uv_unwrap(Array arrays, bool p_block_align, float p_
 	return retarr;
 }
 
+// Runs Delaunay3D::tetrahedralize on p_points and flattens the result:
+// each output simplex contributes its four point indices, in order.
+PoolIntArray MeshUtils::delaunay3d_tetrahedralize(const Vector<Vector3> &p_points) {
+	Vector<Delaunay3D::OutputSimplex> simplices = Delaunay3D::tetrahedralize(p_points);
+	const int simplex_count = simplices.size();
+
+	PoolIntArray indices;
+	indices.resize(simplex_count * 4);
+	PoolIntArray::Write writer = indices.write();
+
+	for (int s = 0; s < simplex_count; ++s) {
+		const Delaunay3D::OutputSimplex &simplex = simplices[s];
+		const int base = s * 4;
+		for (int c = 0; c < 4; ++c) {
+			writer[base + c] = simplex.points[c];
+		}
+	}
+
+	writer.release();
+
+	return indices;
+}
+
 MeshUtils::MeshUtils() {
 	_instance = this;
 }
@@ -608,6 +635,8 @@ void MeshUtils::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("remove_doubles_interpolate_normals", "arr"), &MeshUtils::remove_doubles_interpolate_normals);
 
 	ClassDB::bind_method(D_METHOD("uv_unwrap", "arr", "block_align", "texel_size", "padding", "max_chart_size"), &MeshUtils::uv_unwrap, true, 0.05, 1, 4094);
+
+	ClassDB::bind_method(D_METHOD("delaunay3d_tetrahedralize", "points"), &MeshUtils::delaunay3d_tetrahedralize);
 }
 
 #if GODOT4
diff --git a/mesh_utils.h b/mesh_utils.h
index 1890b31..7138ab6 100644
--- a/mesh_utils.h
+++ b/mesh_utils.h
@@ -56,6 +56,8 @@ public:
 	//Only unwraps, does not create new seams
 	PoolVector2Array uv_unwrap(Array arr, bool p_block_align = true, float p_texel_size = 0.05, int p_padding = 1, int p_max_chart_size = 4094) const;
 
+	PoolIntArray delaunay3d_tetrahedralize(const Vector<Vector3> &p_points);
+
 	MeshUtils();
 	~MeshUtils();