Merge pull request #84384 from zeux/meshopt-update

meshoptimizer: Update to v0.20 (with a reduced patch)
2026-05-12 22:35:35 +00:00 · 2023-12-12 10:39:28 +01:00
parent d7d53cfa39 1f95053aeb
commit 5352490cc9
16 changed files with 681 additions and 866 deletions
@@ -325,7 +325,7 @@ License: Apache-2.0

 Files: ./thirdparty/meshoptimizer/
 Comment: meshoptimizer
-Copyright: 2016-2022, Arseny Kapoulkine
+Copyright: 2016-2023, Arseny Kapoulkine
 License: Expat

 Files: ./thirdparty/mingw-std-threads/
@@ -418,11 +418,10 @@ void ImporterMesh::generate_lods(float p_normal_merge_angle, float p_normal_spli
 			}
 		}

-		LocalVector<float> normal_weights;
-		normal_weights.resize(merged_vertex_count);
-		for (unsigned int j = 0; j < merged_vertex_count; j++) {
-			normal_weights[j] = 2.0; // Give some weight to normal preservation, may be worth exposing as an import setting
-		}
+		const float normal_weights[3] = {
+			// Give some weight to normal preservation, may be worth exposing as an import setting
+			2.0f, 2.0f, 2.0f
+		};

 		Vector<float> merged_vertices_f32 = vector3_to_float32_array(merged_vertices_ptr, merged_vertex_count);
 		float scale = SurfaceTool::simplify_scale_func(merged_vertices_f32.ptr(), merged_vertex_count, sizeof(float) * 3);
@@ -460,12 +459,13 @@ void ImporterMesh::generate_lods(float p_normal_merge_angle, float p_normal_spli
 					(const uint32_t *)merged_indices_ptr, index_count,
 					merged_vertices_f32.ptr(), merged_vertex_count,
 					sizeof(float) * 3, // Vertex stride
+					merged_normals_f32.ptr(),
+					sizeof(float) * 3, // Attribute stride
+					normal_weights, 3,
 					index_target,
 					max_mesh_error,
 					simplify_options,
-					&mesh_error,
-					merged_normals_f32.ptr(),
-					normal_weights.ptr(), 3);
+					&mesh_error);

 			if (new_index_count < last_index_count * 1.5f) {
 				index_target = index_target * 1.5f;
@@ -86,7 +86,7 @@ public:
 	static OptimizeVertexCacheFunc optimize_vertex_cache_func;
 	typedef size_t (*SimplifyFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float *r_error);
 	static SimplifyFunc simplify_func;
-	typedef size_t (*SimplifyWithAttribFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float *result_error, const float *attributes, const float *attribute_weights, size_t attribute_count);
+	typedef size_t (*SimplifyWithAttribFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_data, size_t vertex_count, size_t vertex_stride, const float *attributes, size_t attribute_stride, const float *attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float *result_error);
 	static SimplifyWithAttribFunc simplify_with_attrib_func;
 	typedef float (*SimplifyScaleFunc)(const float *vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 	static SimplifyScaleFunc simplify_scale_func;
@@ -501,7 +501,7 @@ File extracted from upstream release tarball:
 ## meshoptimizer

 - Upstream: https://github.com/zeux/meshoptimizer
- Version: git (4a287848fd664ae1c3fc8e5e008560534ceeb526, 2022)
+- Version: git (c21d3be6ddf627f8ca852ba4b6db9903b0557858, 2023)
 - License: MIT

 Files extracted from upstream repository:
@@ -509,10 +509,8 @@ Files extracted from upstream repository:
 - All files in `src/`
 - `LICENSE.md`

-An [experimental upstream feature](https://github.com/zeux/meshoptimizer/tree/simplify-attr),
-has been backported. On top of that, it was modified to report only distance
-error metrics instead of a combination of distance and attribute errors. Patches
-for both changes can be found in the `patches` directory.
+A patch is included to modify the simplifier to report only distance error
+metrics instead of a combination of distance and attribute errors.


 ## mingw-std-threads
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2016-2022 Arseny Kapoulkine
+Copyright (c) 2016-2023 Arseny Kapoulkine

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -13,7 +13,7 @@ namespace meshopt
 const unsigned char kIndexHeader = 0xe0;
 const unsigned char kSequenceHeader = 0xd0;

-static int gEncodeIndexVersion = 0;
+static int gEncodeIndexVersion = 1;

 typedef unsigned int VertexFifo[16];
 typedef unsigned int EdgeFifo[16][2];
@@ -157,7 +157,7 @@ static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, c
 	}

 	assert(false && "Hash table is full"); // unreachable
-	return 0;
+	return NULL;
 }

 static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
@@ -178,6 +178,22 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position

 		remap[index] = *entry;
 	}
+
+	allocator.deallocate(vertex_table);
+}
+
+template <size_t BlockSize>
+static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
+{
+	size_t block_size = BlockSize == 0 ? vertex_size : BlockSize;
+	assert(block_size == vertex_size);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] != ~0u)
+		{
+			assert(remap[i] < vertex_count);
+			memcpy(static_cast<unsigned char*>(destination) + remap[i] * block_size, static_cast<const unsigned char*>(vertices) + i * block_size, block_size);
+		}
 }

 } // namespace meshopt
@@ -288,6 +304,8 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne

 void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
 {
+	using namespace meshopt;
+
 	assert(vertex_size > 0 && vertex_size <= 256);

 	meshopt_Allocator allocator;
@@ -300,14 +318,23 @@ void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t v
 		vertices = vertices_copy;
 	}

-	for (size_t i = 0; i < vertex_count; ++i)
+	// specialize the loop for common vertex sizes to ensure memcpy is compiled as an inlined intrinsic
+	switch (vertex_size)
 	{
-		if (remap[i] != ~0u)
-		{
-			assert(remap[i] < vertex_count);
+	case 4:
+		return remapVertices<4>(destination, vertices, vertex_count, vertex_size, remap);

-			memcpy(static_cast<unsigned char*>(destination) + remap[i] * vertex_size, static_cast<const unsigned char*>(vertices) + i * vertex_size, vertex_size);
-		}
+	case 8:
+		return remapVertices<8>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 12:
+		return remapVertices<12>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 16:
+		return remapVertices<16>(destination, vertices, vertex_count, vertex_size, remap);
+
+	default:
+		return remapVertices<0>(destination, vertices, vertex_count, vertex_size, remap);
 	}
 }

@@ -1,7 +1,7 @@
 /**
- * meshoptimizer - version 0.18
+ * meshoptimizer - version 0.20
 *
- * Copyright (C) 2016-2022, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2016-2023, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
 *
 * This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,7 +12,7 @@
 #include <stddef.h>

 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 180 /* 0.18 */
+#define MESHOPTIMIZER_VERSION 200 /* 0.20 */

 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -67,6 +67,7 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination,
 *
 * destination must contain enough space for the resulting remap table (vertex_count elements)
 * indices can be NULL if the input is unindexed
+ * stream_count must be <= 16
 */
 MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);

@@ -103,6 +104,7 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
 * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
 *
 * destination must contain enough space for the resulting index buffer (index_count elements)
+ * stream_count must be <= 16
 */
 MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);

@@ -304,13 +306,22 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t cou
 * Input data must contain 4 floats for every quaternion (count*4 total).
 *
 * meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24).
- * Mantissa is shared between all components of a given vector as defined by stride; stride must be divisible by 4.
+ * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
 * Input data must contain stride/4 floats for every vector (count*stride/4 total).
- * When individual (scalar) encoding is desired, simply pass stride=4 and adjust count accordingly.
 */
+enum meshopt_EncodeExpMode
+{
+    /* When encoding exponents, use separate values for each component (maximum quality) */
+    meshopt_EncodeExpSeparate,
+    /* When encoding exponents, use shared value for all components of each vector (better compression) */
+    meshopt_EncodeExpSharedVector,
+    /* When encoding exponents, use shared value for each component of all vectors (best compression) */
+    meshopt_EncodeExpSharedComponent,
+};
+
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);

 /**
 * Simplification options
@@ -321,11 +332,6 @@ enum
    meshopt_SimplifyLockBorder = 1 << 0,
 };

-/**
- * Experimental: Mesh simplifier with attribute metric; attributes follow xyz position data atm (vertex data must contain 3 + attribute_count floats per vertex)
- */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count);
-
 /**
 * Mesh simplifier
 * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
@@ -343,6 +349,18 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* d
 */
 MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);

+/**
+ * Experimental: Mesh simplifier with attribute metric
+ * The algorithm ehnahces meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
+ * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
+ *
+ * vertex_attributes should have attribute_count floats for each vertex
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range.
+ * attribute_count must be <= 16
+ * TODO target_error/result_error currently use combined distance+attribute error; this may change in the future
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+
 /**
 * Experimental: Mesh simplifier (sloppy)
 * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
@@ -367,8 +385,9 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destinati
 *
 * destination must contain enough space for the target index buffer (target_vertex_count elements)
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_colors should can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);

 /**
 * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
@@ -497,7 +516,7 @@ struct meshopt_Bounds
 * For backface culling with orthographic projection, use the following formula to reject backfacing clusters:
 *   dot(view, cone_axis) >= cone_cutoff
 *
- * For perspective projection, you can the formula that needs cone apex in addition to axis & cutoff:
+ * For perspective projection, you can use the formula that needs cone apex in addition to axis & cutoff:
 *   dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff
 *
 * Alternatively, you can use the formula that doesn't need cone apex and uses bounding sphere instead:
@@ -506,7 +525,8 @@ struct meshopt_Bounds
 *   dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius
 *
 * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
- * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
+ * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable (for derivation see
+ * Real-Time Rendering 4th Edition, section 19.3).
 *
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
 * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
@@ -515,13 +535,14 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsig
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

 /**
- * Experimental: Spatial sorter
+ * Spatial sorter
 * Generates a remap table that can be used to reorder points for spatial locality.
 * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer.
 *
 * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
 */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

 /**
 * Experimental: Spatial sorter
@@ -561,19 +582,25 @@ inline int meshopt_quantizeUnorm(float v, int N);
 inline int meshopt_quantizeSnorm(float v, int N);

 /**
- * Quantize a float into half-precision floating point value
+ * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
 * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
 * Representable magnitude range: [6e-5; 65504]
 * Maximum relative reconstruction error: 5e-4
 */
-inline unsigned short meshopt_quantizeHalf(float v);
+MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);

 /**
- * Quantize a float into a floating point value with a limited number of significant mantissa bits
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
 * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
 * Assumes N is in a valid mantissa precision range, which is 1..23
 */
-inline float meshopt_quantizeFloat(float v, int N);
+MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
+
+/**
+ * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
+ * Preserves Inf/NaN, flushes denormals to zero
+ */
+MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
 #endif

 /**
@@ -620,9 +647,11 @@ inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_s
 template <typename T>
 inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
 template <typename T>
-inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = 0);
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
-inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = 0);
+inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
 template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
 template <typename T>
@@ -666,50 +695,6 @@ inline int meshopt_quantizeSnorm(float v, int N)

 	return int(v * scale + round);
 }
-
-inline unsigned short meshopt_quantizeHalf(float v)
-{
-	union { float f; unsigned int ui; } u = {v};
-	unsigned int ui = u.ui;
-
-	int s = (ui >> 16) & 0x8000;
-	int em = ui & 0x7fffffff;
-
-	/* bias exponent and round to nearest; 112 is relative exponent bias (127-15) */
-	int h = (em - (112 << 23) + (1 << 12)) >> 13;
-
-	/* underflow: flush to zero; 113 encodes exponent -14 */
-	h = (em < (113 << 23)) ? 0 : h;
-
-	/* overflow: infinity; 143 encodes exponent 16 */
-	h = (em >= (143 << 23)) ? 0x7c00 : h;
-
-	/* NaN; note that we convert all types of NaN to qNaN */
-	h = (em > (255 << 23)) ? 0x7e00 : h;
-
-	return (unsigned short)(s | h);
-}
-
-inline float meshopt_quantizeFloat(float v, int N)
-{
-	union { float f; unsigned int ui; } u = {v};
-	unsigned int ui = u.ui;
-
-	const int mask = (1 << (23 - N)) - 1;
-	const int round = (1 << (23 - N)) >> 1;
-
-	int e = ui & 0x7f800000;
-	unsigned int rui = (ui + round) & ~mask;
-
-	/* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */
-	ui = e == 0x7f800000 ? ui : rui;
-
-	/* flush denormals to zero */
-	ui = e == 0 ? 0 : ui;
-
-	u.ui = ui;
-	return u.f;
-}
 #endif

 /* Internal implementation helpers */
@@ -746,6 +731,13 @@ public:
 		return result;
 	}

+	void deallocate(void* ptr)
+	{
+		assert(count > 0 && blocks[count - 1] == ptr);
+		Storage::deallocate(ptr);
+		count--;
+	}
+
 private:
 	void* blocks[24];
 	size_t count;
@@ -770,7 +762,7 @@ struct meshopt_IndexAdapter<T, false>

 	meshopt_IndexAdapter(T* result_, const T* input, size_t count_)
 	    : result(result_)
-	    , data(0)
+	    , data(NULL)
 	    , count(count_)
 	{
 		size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
@@ -810,33 +802,33 @@ struct meshopt_IndexAdapter<T, true>
 template <typename T>
 inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
 {
-	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);

-	return meshopt_generateVertexRemap(destination, indices ? in.data : 0, index_count, vertices, vertex_count, vertex_size);
+	return meshopt_generateVertexRemap(destination, indices ? in.data : NULL, index_count, vertices, vertex_count, vertex_size);
 }

 template <typename T>
 inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);

-	return meshopt_generateVertexRemapMulti(destination, indices ? in.data : 0, index_count, vertex_count, streams, stream_count);
+	return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count);
 }

 template <typename T>
 inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
 {
-	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);
 	meshopt_IndexAdapter<T> out(destination, 0, index_count);

-	meshopt_remapIndexBuffer(out.data, indices ? in.data : 0, index_count, remap);
+	meshopt_remapIndexBuffer(out.data, indices ? in.data : NULL, index_count, remap);
 }

 template <typename T>
 inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 	meshopt_generateShadowIndexBuffer(out.data, in.data, index_count, vertices, vertex_count, vertex_size, vertex_stride);
 }
@@ -844,8 +836,8 @@ inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices,
 template <typename T>
 inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 	meshopt_generateShadowIndexBufferMulti(out.data, in.data, index_count, vertex_count, streams, stream_count);
 }
@@ -853,8 +845,8 @@ inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indi
 template <typename T>
 inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count * 2);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count * 2);

 	meshopt_generateAdjacencyIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
@@ -862,8 +854,8 @@ inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indice
 template <typename T>
 inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count * 4);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count * 4);

 	meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
@@ -871,8 +863,8 @@ inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* ind
 template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 	meshopt_optimizeVertexCache(out.data, in.data, index_count, vertex_count);
 }
@@ -880,8 +872,8 @@ inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t
 template <typename T>
 inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 	meshopt_optimizeVertexCacheStrip(out.data, in.data, index_count, vertex_count);
 }
@@ -889,8 +881,8 @@ inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, s
 template <typename T>
 inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 	meshopt_optimizeVertexCacheFifo(out.data, in.data, index_count, vertex_count, cache_size);
 }
@@ -898,8 +890,8 @@ inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, si
 template <typename T>
 inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 	meshopt_optimizeOverdraw(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, threshold);
 }
@@ -907,7 +899,7 @@ inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t in
 template <typename T>
 inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 	return meshopt_optimizeVertexFetchRemap(destination, in.data, index_count, vertex_count);
 }
@@ -923,7 +915,7 @@ inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t
 template <typename T>
 inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 	return meshopt_encodeIndexBuffer(buffer, buffer_size, in.data, index_count);
 }
@@ -940,7 +932,7 @@ inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const u
 template <typename T>
 inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 	return meshopt_encodeIndexSequence(buffer, buffer_size, in.data, index_count);
 }
@@ -957,17 +949,26 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const
 template <typename T>
 inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 	return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, options, result_error);
 }

+template <typename T>
+inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+    meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+    meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+    return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, target_index_count, target_error, options, result_error);
+}
+
 template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
 }
@@ -975,8 +976,8 @@ inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t in
 template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, (index_count / 3) * 5);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, (index_count / 3) * 5);

 	return meshopt_stripify(out.data, in.data, index_count, vertex_count, unsigned(restart_index));
 }
@@ -984,8 +985,8 @@ inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_co
 template <typename T>
 inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, (index_count - 2) * 3);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, (index_count - 2) * 3);

 	return meshopt_unstripify(out.data, in.data, index_count, unsigned(restart_index));
 }
@@ -993,7 +994,7 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_
 template <typename T>
 inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 	return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
 }
@@ -1001,7 +1002,7 @@ inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices
 template <typename T>
 inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 	return meshopt_analyzeOverdraw(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
@@ -1009,7 +1010,7 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
 template <typename T>
 inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 	return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
 }
@@ -1017,7 +1018,7 @@ inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices
 template <typename T>
 inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 	return meshopt_buildMeshlets(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, cone_weight);
 }
@@ -1025,7 +1026,7 @@ inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* mes
 template <typename T>
 inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 	return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
 }
@@ -1033,7 +1034,7 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int*
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 	return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
@@ -1041,15 +1042,15 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde
 template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	meshopt_IndexAdapter<T> in(0, indices, index_count);
-	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 	meshopt_spatialSortTriangles(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
 #endif

 /**
- * Copyright (c) 2016-2022 Arseny Kapoulkine
+ * Copyright (c) 2016-2023 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
@@ -1,176 +0,0 @@
-diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
-index d8d4a67391..3847afc736 100644
--- a/thirdparty/meshoptimizer/simplifier.cpp
-+++ b/thirdparty/meshoptimizer/simplifier.cpp
-@@ -20,7 +20,7 @@
- #define TRACESTATS(i) (void)0
- #endif
- 
-#define ATTRIBUTES 8
-+#define ATTRIBUTES 3
- 
- // This work is based on:
- // Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
-@@ -458,6 +458,7 @@ struct Collapse
- 		float error;
- 		unsigned int errorui;
- 	};
-+	float distance_error;
- };
- 
- static float normalize(Vector3& v)
-@@ -538,6 +539,34 @@ static float quadricError(const Quadric& Q, const Vector3& v)
- 	return fabsf(r) * s;
- }
- 
-+static float quadricErrorNoAttributes(const Quadric& Q, const Vector3& v)
-+{
-+	float rx = Q.b0;
-+	float ry = Q.b1;
-+	float rz = Q.b2;
-+
-+	rx += Q.a10 * v.y;
-+	ry += Q.a21 * v.z;
-+	rz += Q.a20 * v.x;
-+
-+	rx *= 2;
-+	ry *= 2;
-+	rz *= 2;
-+
-+	rx += Q.a00 * v.x;
-+	ry += Q.a11 * v.y;
-+	rz += Q.a22 * v.z;
-+
-+	float r = Q.c;
-+	r += rx * v.x;
-+	r += ry * v.y;
-+	r += rz * v.z;
-+
-+	float s = Q.w == 0.f ? 0.f : 1.f / Q.w;
-+
-+	return fabsf(r) * s;
-+}
-+
- static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w)
- {
- 	float aw = a * w;
-@@ -693,7 +722,7 @@ static void quadricUpdateAttributes(Quadric& Q, const Vector3& p0, const Vector3
- }
- #endif
- 
-static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
-+static void fillFaceQuadrics(Quadric* vertex_quadrics, Quadric* vertex_no_attrib_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
- {
- 	for (size_t i = 0; i < index_count; i += 3)
- 	{
-@@ -703,6 +732,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
- 
- 		Quadric Q;
- 		quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f);
-+		quadricAdd(vertex_no_attrib_quadrics[remap[i0]], Q);
-+		quadricAdd(vertex_no_attrib_quadrics[remap[i1]], Q);
-+		quadricAdd(vertex_no_attrib_quadrics[remap[i2]], Q);
- 
- #if ATTRIBUTES
- 		quadricUpdateAttributes(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], Q.w);
-@@ -713,7 +745,7 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
- 	}
- }
- 
-static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
-+static void fillEdgeQuadrics(Quadric* vertex_quadrics, Quadric* vertex_no_attrib_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
- {
- 	for (size_t i = 0; i < index_count; i += 3)
- 	{
-@@ -757,6 +789,9 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
- 
- 			quadricAdd(vertex_quadrics[remap[i0]], Q);
- 			quadricAdd(vertex_quadrics[remap[i1]], Q);
-+
-+			quadricAdd(vertex_no_attrib_quadrics[remap[i0]], Q);
-+			quadricAdd(vertex_no_attrib_quadrics[remap[i1]], Q);
- 		}
- 	}
- }
-@@ -861,7 +896,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices
- 	return collapse_count;
- }
- 
-static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const Quadric* vertex_quadrics, const unsigned int* remap)
-+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const Quadric* vertex_quadrics, const Quadric* vertex_no_attrib_quadrics, const unsigned int* remap)
- {
- 	for (size_t i = 0; i < collapse_count; ++i)
- 	{
-@@ -881,10 +916,14 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
- 		float ei = quadricError(qi, vertex_positions[i1]);
- 		float ej = quadricError(qj, vertex_positions[j1]);
- 
-+		const Quadric& naqi = vertex_no_attrib_quadrics[remap[i0]];
-+		const Quadric& naqj = vertex_no_attrib_quadrics[remap[j0]];
-+
- 		// pick edge direction with minimal error
- 		c.v0 = ei <= ej ? i0 : j0;
- 		c.v1 = ei <= ej ? i1 : j1;
- 		c.error = ei <= ej ? ei : ej;
-+		c.distance_error = ei <= ej ? quadricErrorNoAttributes(naqi, vertex_positions[i1]) :  quadricErrorNoAttributes(naqj, vertex_positions[j1]);
- 	}
- }
- 
-@@ -981,7 +1020,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse
- 	}
- }
- 
-static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
-+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, Quadric* vertex_no_attrib_quadrics, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
- {
- 	size_t edge_collapses = 0;
- 	size_t triangle_collapses = 0;
-@@ -1043,6 +1082,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
- 		assert(collapse_remap[r1] == r1);
- 
- 		quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
-+		quadricAdd(vertex_no_attrib_quadrics[r1], vertex_no_attrib_quadrics[r0]);
- 
- 		if (vertex_kind[i0] == Kind_Complex)
- 		{
-@@ -1080,7 +1120,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
- 		triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
- 		edge_collapses++;
- 
-		result_error = result_error < c.error ? c.error : result_error;
-+		result_error = result_error < c.distance_error ? c.distance_error : result_error;
- 	}
- 
- #if TRACE
-@@ -1469,9 +1509,11 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
- 
- 	Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
- 	memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
-+	Quadric* vertex_no_attrib_quadrics = allocator.allocate<Quadric>(vertex_count);
-+	memset(vertex_no_attrib_quadrics, 0, vertex_count * sizeof(Quadric));
- 
-	fillFaceQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap);
-	fillEdgeQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
-+	fillFaceQuadrics(vertex_quadrics, vertex_no_attrib_quadrics, indices, index_count, vertex_positions, remap);
-+	fillEdgeQuadrics(vertex_quadrics, vertex_no_attrib_quadrics, indices, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
- 
- 	if (result != indices)
- 		memcpy(result, indices, index_count * sizeof(unsigned int));
-@@ -1502,7 +1544,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
- 		if (edge_collapse_count == 0)
- 			break;
- 
-		rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_quadrics, remap);
-+		rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_quadrics, vertex_no_attrib_quadrics, remap);
- 
- #if TRACE > 1
- 		dumpEdgeCollapses(edge_collapses, edge_collapse_count, vertex_kind);
-@@ -1521,7 +1563,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
- 		printf("pass %d: ", int(pass_count++));
- #endif
- 
-		size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
-+		size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, vertex_no_attrib_quadrics, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
- 
- 		// no edges can be collapsed any more due to hitting the error limit or triangle collapse limit
- 		if (collapses == 0)
@@ -1,263 +0,0 @@
-diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
-index d95725dd71..46d28d3ea3 100644
--- a/thirdparty/meshoptimizer/meshoptimizer.h
-+++ b/thirdparty/meshoptimizer/meshoptimizer.h
-@@ -321,6 +321,11 @@ enum
-     meshopt_SimplifyLockBorder = 1 << 0,
- };
- 
-+/**
-+ * Experimental: Mesh simplifier with attribute metric; attributes follow xyz position data atm (vertex data must contain 3 + attribute_count floats per vertex)
-+ */
-+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count);
-+
- /**
-  * Mesh simplifier
-  * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
-diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
-index 5f0e9bac31..797329b010 100644
--- a/thirdparty/meshoptimizer/simplifier.cpp
-+++ b/thirdparty/meshoptimizer/simplifier.cpp
-@@ -20,6 +20,8 @@
- #define TRACESTATS(i) (void)0
- #endif
- 
-+#define ATTRIBUTES 8
-+
- // This work is based on:
- // Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
- // Michael Garland. Quadric-based polygonal surface simplification. 1999
-@@ -376,6 +378,10 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
- struct Vector3
- {
- 	float x, y, z;
-+
-+#if ATTRIBUTES
-+	float a[ATTRIBUTES];
-+#endif
- };
- 
- static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
-@@ -432,6 +438,13 @@ struct Quadric
- 	float a10, a20, a21;
- 	float b0, b1, b2, c;
- 	float w;
-+
-+#if ATTRIBUTES
-+	float gx[ATTRIBUTES];
-+	float gy[ATTRIBUTES];
-+	float gz[ATTRIBUTES];
-+	float gw[ATTRIBUTES];
-+#endif
- };
- 
- struct Collapse
-@@ -474,6 +487,16 @@ static void quadricAdd(Quadric& Q, const Quadric& R)
- 	Q.b2 += R.b2;
- 	Q.c += R.c;
- 	Q.w += R.w;
-+
-+#if ATTRIBUTES
-+	for (int k = 0; k < ATTRIBUTES; ++k)
-+	{
-+		Q.gx[k] += R.gx[k];
-+		Q.gy[k] += R.gy[k];
-+		Q.gz[k] += R.gz[k];
-+		Q.gw[k] += R.gw[k];
-+	}
-+#endif
- }
- 
- static float quadricError(const Quadric& Q, const Vector3& v)
-@@ -499,6 +522,17 @@ static float quadricError(const Quadric& Q, const Vector3& v)
- 	r += ry * v.y;
- 	r += rz * v.z;
- 
-+#if ATTRIBUTES
-+	// see quadricUpdateAttributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr
-+	for (int k = 0; k < ATTRIBUTES; ++k)
-+	{
-+		float a = v.a[k];
-+
-+		r += a * a * Q.w;
-+		r -= 2 * a * (v.x * Q.gx[k] + v.y * Q.gy[k] + v.z * Q.gz[k] + Q.gw[k]);
-+	}
-+#endif
-+
- 	float s = Q.w == 0.f ? 0.f : 1.f / Q.w;
- 
- 	return fabsf(r) * s;
-@@ -522,6 +556,13 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo
- 	Q.b2 = c * dw;
- 	Q.c = d * dw;
- 	Q.w = w;
-+
-+#if ATTRIBUTES
-+	memset(Q.gx, 0, sizeof(Q.gx));
-+	memset(Q.gy, 0, sizeof(Q.gy));
-+	memset(Q.gz, 0, sizeof(Q.gz));
-+	memset(Q.gw, 0, sizeof(Q.gw));
-+#endif
- }
- 
- static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w)
-@@ -574,6 +615,84 @@ static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3
- 	quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight);
- }
- 
-+#if ATTRIBUTES
-+static void quadricUpdateAttributes(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float w)
-+{
-+	// for each attribute we want to encode the following function into the quadric:
-+	// (eval(pos) - attr)^2
-+	// where eval(pos) interpolates attribute across the triangle like so:
-+	// eval(pos) = pos.x * gx + pos.y * gy + pos.z * gz + gw
-+	// where gx/gy/gz/gw are gradients
-+	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
-+	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
-+
-+	// we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows:
-+	// v = (d11 * d20 - d01 * d21) / denom
-+	// w = (d00 * d21 - d01 * d20) / denom
-+	// u = 1 - v - w
-+	// here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj)
-+	const Vector3& v0 = p10;
-+	const Vector3& v1 = p20;
-+	float d00 = v0.x * v0.x + v0.y * v0.y + v0.z * v0.z;
-+	float d01 = v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
-+	float d11 = v1.x * v1.x + v1.y * v1.y + v1.z * v1.z;
-+	float denom = d00 * d11 - d01 * d01;
-+	float denomr = denom == 0 ? 0.f : 1.f / denom;
-+
-+	// precompute gradient factors
-+	// these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out common factors that are shared between attributes
-+	float gx1 = (d11 * v0.x - d01 * v1.x) * denomr;
-+	float gx2 = (d00 * v1.x - d01 * v0.x) * denomr;
-+	float gy1 = (d11 * v0.y - d01 * v1.y) * denomr;
-+	float gy2 = (d00 * v1.y - d01 * v0.y) * denomr;
-+	float gz1 = (d11 * v0.z - d01 * v1.z) * denomr;
-+	float gz2 = (d00 * v1.z - d01 * v0.z) * denomr;
-+
-+	for (int k = 0; k < ATTRIBUTES; ++k)
-+	{
-+		float a0 = p0.a[k], a1 = p1.a[k], a2 = p2.a[k];
-+
-+		// compute gradient of eval(pos) for x/y/z/w
-+		// the formulas below are obtained by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w
-+		float gx = gx1 * (a1 - a0) + gx2 * (a2 - a0);
-+		float gy = gy1 * (a1 - a0) + gy2 * (a2 - a0);
-+		float gz = gz1 * (a1 - a0) + gz2 * (a2 - a0);
-+		float gw = a0 - p0.x * gx - p0.y * gy - p0.z * gz;
-+
-+		// quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K
-+		// since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields
-+		Q.a00 += w * (gx * gx);
-+		Q.a11 += w * (gy * gy);
-+		Q.a22 += w * (gz * gz);
-+
-+		Q.a10 += w * (gy * gx);
-+		Q.a20 += w * (gz * gx);
-+		Q.a21 += w * (gz * gy);
-+
-+		Q.b0 += w * (gx * gw);
-+		Q.b1 += w * (gy * gw);
-+		Q.b2 += w * (gz * gw);
-+
-+		Q.c += w * (gw * gw);
-+
-+		// the only remaining sum components are ones that depend on attr; these will be addded during error evaluation, see quadricError
-+		Q.gx[k] = w * gx;
-+		Q.gy[k] = w * gy;
-+		Q.gz[k] = w * gz;
-+		Q.gw[k] = w * gw;
-+
-+#if TRACE > 2
-+		printf("attr%d: %e %e %e\n",
-+			k,
-+			(gx * p0.x + gy * p0.y + gz * p0.z + gw - a0),
-+			(gx * p1.x + gy * p1.y + gz * p1.z + gw - a1),
-+			(gx * p2.x + gy * p2.y + gz * p2.z + gw - a2)
-+			);
-+#endif
-+	}
-+}
-+#endif
-+
- static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
- {
- 	for (size_t i = 0; i < index_count; i += 3)
-@@ -585,6 +704,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
- 		Quadric Q;
- 		quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f);
- 
-+#if ATTRIBUTES
-+		quadricUpdateAttributes(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], Q.w);
-+#endif
- 		quadricAdd(vertex_quadrics[remap[i0]], Q);
- 		quadricAdd(vertex_quadrics[remap[i1]], Q);
- 		quadricAdd(vertex_quadrics[remap[i2]], Q);
-@@ -1278,14 +1400,20 @@ MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = 0;
- #endif
- 
- size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
-+{
-+	return meshopt_simplifyWithAttributes(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, target_index_count, target_error, options, out_result_error, 0, 0, 0);
-+}
-+
-+size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error, const float* attributes, const float* attribute_weights, size_t attribute_count)
- {
- 	using namespace meshopt;
- 
- 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
-	assert(vertex_positions_stride % sizeof(float) == 0);
-+	assert(vertex_stride >= 12 && vertex_stride <= 256);
-+	assert(vertex_stride % sizeof(float) == 0);
- 	assert(target_index_count <= index_count);
- 	assert((options & ~(meshopt_SimplifyLockBorder)) == 0);
-+	assert(attribute_count <= ATTRIBUTES);
- 
- 	meshopt_Allocator allocator;
- 
-@@ -1299,7 +1427,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
- 	// build position remap that maps each vertex to the one with identical position
- 	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
- 	unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
-	buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, allocator);
-+	buildPositionRemap(remap, wedge, vertex_data, vertex_count, vertex_stride, allocator);
- 
- 	// classify vertices; vertex kind determines collapse rules, see kCanCollapse
- 	unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
-@@ -1323,7 +1451,21 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
- #endif
- 
- 	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
-	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
-+	rescalePositions(vertex_positions, vertex_data, vertex_count, vertex_stride);
-+
-+#if ATTRIBUTES
-+	for (size_t i = 0; i < vertex_count; ++i)
-+	{
-+		memset(vertex_positions[i].a, 0, sizeof(vertex_positions[i].a));
-+
-+		for (size_t k = 0; k < attribute_count; ++k)
-+		{
-+			float a = attributes[i * attribute_count + k];
-+
-+			vertex_positions[i].a[k] = a * attribute_weights[k];
-+		}
-+	}
-+#endif
- 
- 	Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
- 	memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
-@@ -1415,7 +1557,9 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
- 
- 	// result_error is quadratic; we need to remap it back to linear
- 	if (out_result_error)
-+	{
- 		*out_result_error = sqrtf(result_error);
-+	}
- 
- 	return result_count;
- }
@@ -0,0 +1,39 @@
+diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
+index 5ba8570076..6f8b0e520e 100644
+--- a/thirdparty/meshoptimizer/simplifier.cpp
+++ b/thirdparty/meshoptimizer/simplifier.cpp
+@@ -476,6 +476,8 @@ struct Collapse
+ 		float error;
+ 		unsigned int errorui;
+ 	};
+
+	float distance_error;
+ };
+ 
+ static float normalize(Vector3& v)
+@@ -941,6 +943,8 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
+ 		float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]);
+ 		float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]);
+ 
+		float dei = ei, dej = ej;
+
+ 		if (attribute_count)
+ 		{
+ 			ei += quadricError(attribute_quadrics[remap[i0]], &attribute_gradients[remap[i0] * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
+@@ -951,6 +955,7 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
+ 		c.v0 = ei <= ej ? i0 : j0;
+ 		c.v1 = ei <= ej ? i1 : j1;
+ 		c.error = ei <= ej ? ei : ej;
+		c.distance_error = ei <= ej ? dei : dej;
+ 	}
+ }
+ 
+@@ -1097,7 +1102,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
+ 		triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
+ 		edge_collapses++;
+ 
+-		result_error = result_error < c.error ? c.error : result_error;
+		result_error = result_error < c.distance_error ? c.distance_error : result_error;
+ 	}
+ 
+ #if TRACE
@@ -0,0 +1,70 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+
+unsigned short meshopt_quantizeHalf(float v)
+{
+	union { float f; unsigned int ui; } u = {v};
+	unsigned int ui = u.ui;
+
+	int s = (ui >> 16) & 0x8000;
+	int em = ui & 0x7fffffff;
+
+	// bias exponent and round to nearest; 112 is relative exponent bias (127-15)
+	int h = (em - (112 << 23) + (1 << 12)) >> 13;
+
+	// underflow: flush to zero; 113 encodes exponent -14
+	h = (em < (113 << 23)) ? 0 : h;
+
+	// overflow: infinity; 143 encodes exponent 16
+	h = (em >= (143 << 23)) ? 0x7c00 : h;
+
+	// NaN; note that we convert all types of NaN to qNaN
+	h = (em > (255 << 23)) ? 0x7e00 : h;
+
+	return (unsigned short)(s | h);
+}
+
+float meshopt_quantizeFloat(float v, int N)
+{
+	assert(N >= 0 && N <= 23);
+
+	union { float f; unsigned int ui; } u = {v};
+	unsigned int ui = u.ui;
+
+	const int mask = (1 << (23 - N)) - 1;
+	const int round = (1 << (23 - N)) >> 1;
+
+	int e = ui & 0x7f800000;
+	unsigned int rui = (ui + round) & ~mask;
+
+	// round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0
+	ui = e == 0x7f800000 ? ui : rui;
+
+	// flush denormals to zero
+	ui = e == 0 ? 0 : ui;
+
+	u.ui = ui;
+	return u.f;
+}
+
+float meshopt_dequantizeHalf(unsigned short h)
+{
+	unsigned int s = unsigned(h & 0x8000) << 16;
+	int em = h & 0x7fff;
+
+	// bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
+	int r = (em + (112 << 10)) << 13;
+
+	// denormal: flush to zero
+	r = (em < (1 << 10)) ? 0 : r;
+
+	// infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
+	// 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
+	r += (em >= (31 << 10)) ? (112 << 23) : 0;
+
+	union { float f; unsigned int ui; } u;
+	u.ui = s | r;
+	return u.f;
+}
@@ -221,9 +221,9 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
 	}

-	unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
+	unsigned int cache_holder[2 * (kCacheSizeMax + 4)];
 	unsigned int* cache = cache_holder;
-	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
+	unsigned int* cache_new = cache_holder + kCacheSizeMax + 4;
 	size_t cache_count = 0;

 	unsigned int current_triangle = 0;
@@ -260,10 +260,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		{
 			unsigned int index = cache[i];

-			if (index != a && index != b && index != c)
-			{
-				cache_new[cache_write++] = index;
-			}
+			cache_new[cache_write] = index;
+			cache_write += (index != a && index != b && index != c);
 		}

 		unsigned int* cache_temp = cache;
@@ -305,6 +303,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		{
 			unsigned int index = cache[i];

+			// no need to update scores if we are never going to use this vertex
+			if (adjacency.counts[index] == 0)
+				continue;
+
 			int cache_position = i >= cache_size ? -1 : int(i);

 			// update vertex score
@@ -325,11 +327,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 				float tri_score = triangle_scores[tri] + score_diff;
 				assert(tri_score > 0);

-				if (best_score < tri_score)
-				{
-					best_triangle = tri;
-					best_score = tri_score;
-				}
+				best_triangle = best_score < tri_score ? tri : best_triangle;
+				best_score = best_score < tri_score ? tri_score : best_score;

 				triangle_scores[tri] = tri_score;
 			}
@@ -44,6 +44,10 @@
 // When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
 #if defined(__wasm_simd128__)
 #define SIMD_WASM
+// Prevent compiling other variant when wasm simd compilation is active
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
 #endif

 #ifndef SIMD_TARGET
@@ -83,19 +87,17 @@
 #endif

 #ifdef SIMD_WASM
-#undef __DEPRECATED
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
 #include <wasm_simd128.h>
 #endif

 #ifdef SIMD_WASM
-#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
-#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
-#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
-#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
-#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
-#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
-#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
+#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
+#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
+#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
+#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
+#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
 #endif

 namespace meshopt
@@ -218,7 +220,7 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

 	if (size_t(data_end - data) < header_size)
-		return 0;
+		return NULL;

 	data += header_size;

@@ -227,7 +229,7 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
 	{
 		if (size_t(data_end - data) < kByteGroupDecodeLimit)
-			return 0;
+			return NULL;

 		int best_bits = 8;
 		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
@@ -286,7 +288,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data

 		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
 		if (!data)
-			return 0;
+			return NULL;
 	}

 	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
@@ -294,7 +296,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
 	return data;
 }

-#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
+#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
 static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
 {
 #define READ() byte = *data++
@@ -354,14 +356,14 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

 	if (size_t(data_end - data) < header_size)
-		return 0;
+		return NULL;

 	data += header_size;

 	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
 	{
 		if (size_t(data_end - data) < kByteGroupDecodeLimit)
-			return 0;
+			return NULL;

 		size_t header_offset = i / kByteGroupSize;

@@ -386,7 +388,7 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u
 	{
 		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
 		if (!data)
-			return 0;
+			return NULL;

 		size_t vertex_offset = k;

@@ -757,7 +759,7 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);

 	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
-	sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+	sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

 	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);

@@ -777,9 +779,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
 SIMD_TARGET
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
 {
-	unsigned char byte, enc, encv;
-	const unsigned char* data_var;
-
 	switch (bitslog2)
 	{
 	case 0:
@@ -807,7 +806,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi

 		v128_t shuf = decodeShuffleMask(mask0, mask1);

-		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);

 		wasm_v128_store(buffer, result);

@@ -829,7 +828,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi

 		v128_t shuf = decodeShuffleMask(mask0, mask1);

-		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);

 		wasm_v128_store(buffer, result);

@@ -939,7 +938,7 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

 	if (size_t(data_end - data) < header_size)
-		return 0;
+		return NULL;

 	data += header_size;

@@ -961,7 +960,7 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 	for (; i < buffer_size; i += kByteGroupSize)
 	{
 		if (size_t(data_end - data) < kByteGroupDecodeLimit)
-			return 0;
+			return NULL;

 		size_t header_offset = i / kByteGroupSize;

@@ -989,7 +988,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 		{
 			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
 			if (!data)
-				return 0;
+				return NULL;
 		}

 #if defined(SIMD_SSE) || defined(SIMD_AVX)
@@ -1183,7 +1182,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);

-	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;

 #if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
 	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
@@ -30,6 +30,9 @@
 // When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
 #if defined(__wasm_simd128__)
 #define SIMD_WASM
+// Prevent compiling other variant when wasm simd compilation is active
+#undef SIMD_NEON
+#undef SIMD_SSE
 #endif

 #endif // !MESHOPTIMIZER_NO_SIMD
@@ -63,6 +66,10 @@
 #define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
 #endif

+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
 namespace meshopt
 {

@@ -185,9 +192,7 @@ inline uint64_t rotateleft64(uint64_t v, int x)
 {
 #if defined(_MSC_VER) && !defined(__clang__)
 	return _rotl64(v, x);
-// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for
-// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
-#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11)
+#elif defined(__clang__) && __has_builtin(__builtin_rotateleft64)
 	return __builtin_rotateleft64(v, x);
 #else
 	return (v << (x & 63)) | (v >> ((64 - x) & 63));
@@ -791,6 +796,33 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 }
 #endif

+// optimized variant of frexp
+inline int optlog2(float v)
+{
+	union
+	{
+		float f;
+		unsigned int ui;
+	} u;
+
+	u.f = v;
+	// +1 accounts for implicit 1. in mantissa; denormalized numbers will end up clamped to min_exp by calling code
+	return u.ui == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
+}
+
+// optimized variant of ldexp
+inline float optexp2(int e)
+{
+	union
+	{
+		float f;
+		unsigned int ui;
+	} u;
+
+	u.ui = unsigned(e + 127) << 23;
+	return u.f;
+}
+
 } // namespace meshopt

 void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)
@@ -918,39 +950,78 @@ void meshopt_encodeFilterQuat(void* destination_, size_t count, size_t stride, i
 	}
 }

-void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data)
+void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode)
 {
-	assert(stride > 0 && stride % 4 == 0);
+	using namespace meshopt;
+
+	assert(stride > 0 && stride % 4 == 0 && stride <= 256);
 	assert(bits >= 1 && bits <= 24);

 	unsigned int* destination = static_cast<unsigned int*>(destination_);
 	size_t stride_float = stride / sizeof(float);

+	int component_exp[64];
+	assert(stride_float <= sizeof(component_exp) / sizeof(int));
+
+	const int min_exp = -100;
+
+	if (mode == meshopt_EncodeExpSharedComponent)
+	{
+		for (size_t j = 0; j < stride_float; ++j)
+			component_exp[j] = min_exp;
+
+		for (size_t i = 0; i < count; ++i)
+		{
+			const float* v = &data[i * stride_float];
+
+			// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (component_exp[j] < e) ? e : component_exp[j];
+			}
+		}
+	}
+
 	for (size_t i = 0; i < count; ++i)
 	{
 		const float* v = &data[i * stride_float];
 		unsigned int* d = &destination[i * stride_float];

-		// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
-		int exp = -100;
+		int vector_exp = min_exp;

-		for (size_t j = 0; j < stride_float; ++j)
+		if (mode == meshopt_EncodeExpSharedVector)
 		{
-			int e;
-			frexp(v[j], &e);
+			// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);

-			exp = (exp < e) ? e : exp;
+				vector_exp = (vector_exp < e) ? e : vector_exp;
+			}
+		}
+		else if (mode == meshopt_EncodeExpSeparate)
+		{
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (min_exp < e) ? e : min_exp;
+			}
 		}

-		// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
-		exp -= (bits - 1);
-
-		// compute renormalized rounded mantissa for each component
-		int mmask = (1 << 24) - 1;
-
 		for (size_t j = 0; j < stride_float; ++j)
 		{
-			int m = int(ldexp(v[j], -exp) + (v[j] >= 0 ? 0.5f : -0.5f));
+			int exp = (mode == meshopt_EncodeExpSharedVector) ? vector_exp : component_exp[j];
+
+			// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
+			exp -= (bits - 1);
+
+			// compute renormalized rounded mantissa for each component
+			int mmask = (1 << 24) - 1;
+
+			int m = int(v[j] * optexp2(-exp) + (v[j] >= 0 ? 0.5f : -0.5f));

 			d[j] = (m & mmask) | (unsigned(exp) << 24);
 		}