editor updates: male dummy, meshoptimizer, cpp mesh export

2023-11-01 10:43:08 +01:00
parent d6c4ebf980
commit 98c208eabd
14 changed files with 55768 additions and 55348 deletions
@@ -529,102 +529,103 @@ void MeshWindow::Create(EditorComponent* _editor)
 		params.extensions.push_back("h");
 		params.type = wi::helper::FileDialogParams::TYPE::SAVE;
 		wi::helper::FileDialog(params, [=](std::string filename) {
-
-			// Bake transformed and skinned positions:
-			wi::vector<XMFLOAT3> vertices(mesh->vertex_positions.size());
-			const Scene& scene = editor->GetCurrentScene();
-			XMMATRIX M = XMMatrixIdentity();
-			if (editor->componentsWnd.objectWnd.entity != INVALID_ENTITY)
-			{
-				// if first selection is an object then transformation will be also applied
-				Entity object_entity = editor->componentsWnd.objectWnd.entity;
-				const ObjectComponent* object = scene.objects.GetComponent(object_entity);
-				if (object != nullptr)
+			wi::eventhandler::Subscribe_Once(wi::eventhandler::EVENT_THREAD_SAFE_POINT, [=](uint64_t userdata) { // run the whole export from a one-shot EVENT_THREAD_SAFE_POINT callback
+				// Bake transformed and skinned positions:
+				wi::vector<XMFLOAT3> vertices(mesh->vertex_positions.size());
+				const Scene& scene = editor->GetCurrentScene();
+				XMMATRIX M = XMMatrixIdentity();
+				if (editor->componentsWnd.objectWnd.entity != INVALID_ENTITY)
 				{
-					size_t index = scene.objects.GetIndex(object_entity);
-					M = XMLoadFloat4x4(&scene.matrix_objects[index]);
+					// if first selection is an object then transformation will be also applied
+					Entity object_entity = editor->componentsWnd.objectWnd.entity;
+					const ObjectComponent* object = scene.objects.GetComponent(object_entity);
+					if (object != nullptr)
+					{
+						size_t index = scene.objects.GetIndex(object_entity);
+						M = XMLoadFloat4x4(&scene.matrix_objects[index]);
+					}
 				}
-			}
-			const ArmatureComponent* armature = scene.armatures.GetComponent(mesh->armatureID);
-			for (size_t i = 0; i < mesh->vertex_positions.size(); ++i)
-			{
-				XMVECTOR P;
-				if (armature == nullptr)
+				const ArmatureComponent* armature = scene.armatures.GetComponent(mesh->armatureID);
+				for (size_t i = 0; i < mesh->vertex_positions.size(); ++i)
 				{
-					P = XMLoadFloat3(&mesh->vertex_positions[i]);
+					XMVECTOR P;
+					if (armature == nullptr)
+					{
+						P = XMLoadFloat3(&mesh->vertex_positions[i]);
+					}
+					else
+					{
+						P = wi::scene::SkinVertex(*mesh, *armature, (uint32_t)i);
+					}
+					P = XMVector3Transform(P, M);
+					XMStoreFloat3(&vertices[i], P);
+				}
+
+				// Gather all indices for all subsets in LOD0:
+				wi::vector<uint32_t> indices;
+				uint32_t first_subset = 0;
+				uint32_t last_subset = 0;
+				mesh->GetLODSubsetRange(0, first_subset, last_subset);
+				for (uint32_t subsetIndex = first_subset; subsetIndex < last_subset; ++subsetIndex)
+				{
+					const MeshComponent::MeshSubset& subset = mesh->subsets[subsetIndex];
+					if (subset.indexCount == 0)
+						continue;
+					for (uint32_t i = 0; i < subset.indexCount; ++i)
+					{
+						indices.push_back(mesh->indices[subset.indexOffset + i]);
+					}
+				}
+
+				// Generate shadow indices for position-only stream:
+				wi::vector<uint32_t> shadow_indices(indices.size()); // exactly one shadow index per gathered index; no padding, so no zero-filled extra triangles reach the output
+				meshopt_generateShadowIndexBuffer(
+					shadow_indices.data(), indices.data(), indices.size(),
+					vertices.data(), vertices.size(), sizeof(XMFLOAT3), sizeof(XMFLOAT3)
+				);
+
+				// De-duplicate vertices based on shadow index buffer:
+				wi::vector<unsigned int> remap(vertices.size()); // the remap table holds one entry per input VERTEX (meshopt writes vertex_count elements); sizing it by index count was the out-of-bounds write
+				const size_t vertex_count = meshopt_generateVertexRemap(
+					remap.data(),
+					shadow_indices.data(), shadow_indices.size(),
+					vertices.data(), vertices.size(), sizeof(XMFLOAT3)
+				);
+				wi::vector<XMFLOAT3> remapped_vertices(vertex_count); // vertex_count = number of unique vertices returned by meshopt_generateVertexRemap
+				wi::vector<uint32_t> remapped_indices(shadow_indices.size());
+				meshopt_remapIndexBuffer(remapped_indices.data(), shadow_indices.data(), shadow_indices.size(), remap.data());
+				meshopt_remapVertexBuffer(remapped_vertices.data(), vertices.data(), vertices.size() /*initial vertex count, not the one returned from meshopt_generateVertexRemap*/, sizeof(XMFLOAT3), remap.data());
+
+				// Optimizations:
+				meshopt_optimizeVertexCache(remapped_indices.data(), remapped_indices.data(), remapped_indices.size(), vertex_count);
+				meshopt_optimizeVertexFetch(remapped_vertices.data(), remapped_indices.data(), remapped_indices.size(), remapped_vertices.data(), vertex_count, sizeof(XMFLOAT3));
+
+				// Generate C++ header syntax:
+				std::string str;
+				str += "static const float3 vertices[" + std::to_string(remapped_vertices.size()) + "] = {\n";
+				for (auto& pos : remapped_vertices)
+				{
+					str += "\tfloat3(" + std::to_string(pos.x) + "f," + std::to_string(pos.y) + "f," + std::to_string(pos.z) + "f),\n";
+				}
+				str += "};\n";
+				str += "static const unsigned int indices[" + std::to_string(remapped_indices.size()) + "] = {\n";
+				for (size_t i = 0; i < remapped_indices.size(); i += 3) // one output row per triangle (three indices)
+				{
+					str += "\t" + std::to_string(remapped_indices[i + 0]) + "," + std::to_string(remapped_indices[i + 1]) + "," + std::to_string(remapped_indices[i + 2]) + ",\n";
+				}
+				str += "};\n";
+
+				// Write to file:
+				std::string filename_dest = wi::helper::ForceExtension(filename, "h"); // separate local: the captured filename is not reassigned inside this callback
+				if (wi::helper::FileWrite(filename_dest, (uint8_t*)str.c_str(), str.length()))
+				{
+					editor->PostSaveText("Mesh exported to header file: ", filename_dest);
 				}
 				else
 				{
-					P = wi::scene::SkinVertex(*mesh, *armature, (uint32_t)i);
+					editor->PostSaveText("Failed to write file: ", filename_dest);
 				}
-				P = XMVector3Transform(P, M);
-				XMStoreFloat3(&vertices[i], P);
-			}
-
-			// Gather all indices for all subsets in LOD0:
-			wi::vector<uint32_t> indices;
-			uint32_t first_subset = 0;
-			uint32_t last_subset = 0;
-			mesh->GetLODSubsetRange(0, first_subset, last_subset);
-			for (uint32_t subsetIndex = first_subset; subsetIndex < last_subset; ++subsetIndex)
-			{
-				const MeshComponent::MeshSubset& subset = mesh->subsets[subsetIndex];
-				if (subset.indexCount == 0)
-					continue;
-				for (uint32_t i = 0; i < subset.indexCount; ++i)
-				{
-					indices.push_back(mesh->indices[subset.indexOffset + i]);
-				}
-			}
-
-			// Generate shadow indices for position-only stream:
-			wi::vector<uint32_t> shadow_indices(indices.size());
-			meshopt_generateShadowIndexBuffer(
-				shadow_indices.data(), indices.data(), indices.size(),
-				vertices.data(), vertices.size(), sizeof(XMFLOAT3), sizeof(XMFLOAT3)
-			);
-
-			// De-duplicate vertices based on shadow index buffer:
-			wi::vector<unsigned int> remap(shadow_indices.size());
-			const size_t vertex_count = meshopt_generateVertexRemap(
-				remap.data(),
-				shadow_indices.data(), shadow_indices.size(),
-				vertices.data(), vertices.size(), sizeof(XMFLOAT3)
-			);
-			wi::vector<XMFLOAT3> remapped_vertices(vertex_count);
-			wi::vector<uint32_t> remapped_indices(shadow_indices.size());
-			meshopt_remapIndexBuffer(remapped_indices.data(), shadow_indices.data(), shadow_indices.size(), remap.data());
-			meshopt_remapVertexBuffer(remapped_vertices.data(), vertices.data(), vertices.size() /*initial vertex count, not the one returned from meshopt_generateVertexRemap*/, sizeof(XMFLOAT3), remap.data());
-
-			// Optimizations:
-			meshopt_optimizeVertexCache(remapped_indices.data(), remapped_indices.data(), remapped_indices.size(), vertex_count);
-			meshopt_optimizeVertexFetch(remapped_vertices.data(), remapped_indices.data(), remapped_indices.size(), remapped_vertices.data(), vertex_count, sizeof(XMFLOAT3));
-
-			// Generate C++ header syntax:
-			std::string str;
-			str += "static const float3 vertices[" + std::to_string(remapped_vertices.size()) + "] = {\n";
-			for (auto& pos : remapped_vertices)
-			{
-				str += "\tfloat3(" + std::to_string(pos.x) + "f," + std::to_string(pos.y) + "f," + std::to_string(pos.z) + "f),\n";
-			}
-			str += "};\n";
-			str += "static const unsigned int indices[" + std::to_string(remapped_indices.size()) + "] = {\n";
-			for (size_t i = 0; i < remapped_indices.size(); i += 3)
-			{
-				str += "\t" + std::to_string(remapped_indices[i + 0]) + "," + std::to_string(remapped_indices[i + 1]) + "," + std::to_string(remapped_indices[i + 2]) + ",\n";
-			}
-			str += "};\n";
-
-			// Write to file:
-			filename = wi::helper::ForceExtension(filename, "h");
-			if (wi::helper::FileWrite(filename, (uint8_t*)str.c_str(), str.length()))
-			{
-				editor->PostSaveText("Mesh exported to header file: ", filename);
-			}
-			else
-			{
-				editor->PostSaveText("Failed to write file: ", filename);
-			}
+			});
 		});
 	});
 	AddWidget(&exportHeaderButton);
@@ -283,6 +283,79 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int
 	return result;
 }

+static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra) // picks the best triangle from the adjacency lists of the meshlet's vertices; returns ~0u when those lists are all empty
+{
+	unsigned int best_triangle = ~0u; // ~0u means "no candidate found"
+	unsigned int best_extra = 5; // sentinel: every candidate's adjusted extra below is in [0, 4], so the first candidate always replaces it
+	float best_score = FLT_MAX; // lower score is better
+
+	for (size_t i = 0; i < meshlet.vertex_count; ++i)
+	{
+		unsigned int index = meshlet_vertices[meshlet.vertex_offset + i]; // i-th vertex of this meshlet
+
+		unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; // start of this vertex's triangle list
+		size_t neighbors_size = adjacency.counts[index];
+
+		for (size_t j = 0; j < neighbors_size; ++j)
+		{
+			unsigned int triangle = neighbors[j];
+			unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+			unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); // number of corners whose used[] entry is 0xff
+
+			// triangles that don't add new vertices to meshlets are max. priority
+			if (extra != 0)
+			{
+				// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
+				if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
+					extra = 0;
+
+				extra++; // dangling triangles end up at 1; the others at (new corner count + 1)
+			}
+
+			// since topology-based priority is always more important than the score, we can skip scoring in some cases
+			if (extra > best_extra)
+				continue;
+
+			float score = 0;
+
+			// caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles)
+			if (meshlet_cone) // non-null cone => geometrical scoring; cone_weight and meshlet_expected_radius are only read on this path
+			{
+				const Cone& tri_cone = triangles[triangle];
+
+				float distance2 =
+				    (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) +
+				    (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) +
+				    (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz);
+
+				float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz; // dot product of the two cones' (nx, ny, nz)
+
+				score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
+			}
+			else
+			{
+				// each live_triangles entry is >= 1 since it includes the current triangle we're processing
+				score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3);
+			}
+
+			// note that topology-based priority is always more important than the score
+			// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
+			if (extra < best_extra || score < best_score) // extra > best_extra was skipped above, so the score only decides between equal extras
+			{
+				best_triangle = triangle;
+				best_extra = extra;
+				best_score = score;
+			}
+		}
+	}
+
+	if (out_extra) // optional output; receives 5 (the initial sentinel) when no candidate was found
+		*out_extra = best_extra;
+
+	return best_triangle;
+}
+
 struct KDNode
 {
 	union
@@ -464,13 +537,15 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	using namespace meshopt;

 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
 	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned

+	assert(cone_weight >= 0 && cone_weight <= 1);
+
 	meshopt_Allocator allocator;

 	TriangleAdjacency2 adjacency = {};
@@ -511,65 +586,18 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve

 	for (;;)
 	{
-		unsigned int best_triangle = ~0u;
-		unsigned int best_extra = 5;
-		float best_score = FLT_MAX;
-
 		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);

-		for (size_t i = 0; i < meshlet.vertex_count; ++i)
+		unsigned int best_extra = 0;
+		unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra);
+
+		// if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
+		if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
 		{
-			unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
-
-			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
-			size_t neighbours_size = adjacency.counts[index];
-
-			for (size_t j = 0; j < neighbours_size; ++j)
-			{
-				unsigned int triangle = neighbours[j];
-				assert(!emitted_flags[triangle]);
-
-				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
-				assert(a < vertex_count && b < vertex_count && c < vertex_count);
-
-				unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
-
-				// triangles that don't add new vertices to meshlets are max. priority
-				if (extra != 0)
-				{
-					// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
-					if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
-						extra = 0;
-
-					extra++;
-				}
-
-				// since topology-based priority is always more important than the score, we can skip scoring in some cases
-				if (extra > best_extra)
-					continue;
-
-				const Cone& tri_cone = triangles[triangle];
-
-				float distance2 =
-				    (tri_cone.px - meshlet_cone.px) * (tri_cone.px - meshlet_cone.px) +
-				    (tri_cone.py - meshlet_cone.py) * (tri_cone.py - meshlet_cone.py) +
-				    (tri_cone.pz - meshlet_cone.pz) * (tri_cone.pz - meshlet_cone.pz);
-
-				float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
-
-				float score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
-
-				// note that topology-based priority is always more important than the score
-				// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
-				if (extra < best_extra || score < best_score)
-				{
-					best_triangle = triangle;
-					best_extra = extra;
-					best_score = score;
-				}
-			}
+			best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL);
 		}

+		// when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
 		if (best_triangle == ~0u)
 		{
 			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
@@ -604,16 +632,16 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 		{
 			unsigned int index = indices[best_triangle * 3 + k];

-			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
-			size_t neighbours_size = adjacency.counts[index];
+			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbors_size = adjacency.counts[index];

-			for (size_t i = 0; i < neighbours_size; ++i)
+			for (size_t i = 0; i < neighbors_size; ++i)
 			{
-				unsigned int tri = neighbours[i];
+				unsigned int tri = neighbors[i];

 				if (tri == best_triangle)
 				{
-					neighbours[i] = neighbours[neighbours_size - 1];
+					neighbors[i] = neighbors[neighbors_size - 1];
 					adjacency.counts[index]--;
 					break;
 				}
@@ -687,7 +715,7 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t

 	assert(index_count % 3 == 0);
 	assert(index_count / 3 <= kMeshletMaxTriangles);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	(void)vertex_count;
@@ -839,7 +867,7 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices
 	using namespace meshopt;

 	assert(triangle_count <= kMeshletMaxTriangles);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	unsigned int indices[kMeshletMaxTriangles * 3];
@@ -13,7 +13,7 @@ namespace meshopt
 const unsigned char kIndexHeader = 0xe0;
 const unsigned char kSequenceHeader = 0xd0;

-static int gEncodeIndexVersion = 0;
+static int gEncodeIndexVersion = 1;

 typedef unsigned int VertexFifo[16];
 typedef unsigned int EdgeFifo[16][2];
@@ -157,7 +157,7 @@ static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, c
 	}

 	assert(false && "Hash table is full"); // unreachable
-	return 0;
+	return NULL;
 }

 static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
@@ -178,6 +178,22 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position

 		remap[index] = *entry;
 	}
+
+	allocator.deallocate(vertex_table);
+}
+
+template <size_t BlockSize> // BlockSize: compile-time vertex size in bytes, or 0 to use the runtime vertex_size
+static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap) // copies vertex i of `vertices` into slot remap[i] of `destination`
+{
+	size_t block_size = BlockSize == 0 ? vertex_size : BlockSize;
+	assert(block_size == vertex_size); // a non-zero BlockSize must equal the runtime vertex_size
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] != ~0u) // entries equal to ~0u are skipped, so those vertices are not copied
+		{
+			assert(remap[i] < vertex_count);
+			memcpy(static_cast<unsigned char*>(destination) + remap[i] * block_size, static_cast<const unsigned char*>(vertices) + i * block_size, block_size);
+		}
 }

 } // namespace meshopt
@@ -187,7 +203,7 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int
 	using namespace meshopt;

 	assert(indices || index_count == vertex_count);
-	assert(index_count % 3 == 0);
+	assert(!indices || index_count % 3 == 0);
 	assert(vertex_size > 0 && vertex_size <= 256);

 	meshopt_Allocator allocator;
@@ -288,6 +304,8 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne

 void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
 {
+	using namespace meshopt;
+
 	assert(vertex_size > 0 && vertex_size <= 256);

 	meshopt_Allocator allocator;
@@ -300,14 +318,23 @@ void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t v
 		vertices = vertices_copy;
 	}

-	for (size_t i = 0; i < vertex_count; ++i)
+	// specialize the loop for common vertex sizes to ensure memcpy is compiled as an inlined intrinsic
+	switch (vertex_size)
 	{
-		if (remap[i] != ~0u)
-		{
-			assert(remap[i] < vertex_count);
+	case 4:
+		return remapVertices<4>(destination, vertices, vertex_count, vertex_size, remap);

-			memcpy(static_cast<unsigned char*>(destination) + remap[i] * vertex_size, static_cast<const unsigned char*>(vertices) + i * vertex_size, vertex_size);
-		}
+	case 8:
+		return remapVertices<8>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 12:
+		return remapVertices<12>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 16:
+		return remapVertices<16>(destination, vertices, vertex_count, vertex_size, remap);
+
+	default:
+		return remapVertices<0>(destination, vertices, vertex_count, vertex_size, remap);
 	}
 }

@@ -412,7 +439,7 @@ void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsig
 	using namespace meshopt;

 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	meshopt_Allocator allocator;
@@ -483,7 +510,7 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un
 	using namespace meshopt;

 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	meshopt_Allocator allocator;
@@ -147,7 +147,7 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
 	using namespace meshopt;

 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	meshopt_Allocator allocator;
@@ -272,7 +272,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
 	using namespace meshopt;

 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	meshopt_Allocator allocator;
@@ -0,0 +1,70 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+
+unsigned short meshopt_quantizeHalf(float v) // packs v into 16 bits: sign in bit 15, rebiased exponent and rounded mantissa in the low 15 bits
+{
+	union { float f; unsigned int ui; } u = {v}; // read the float's bit pattern through the union
+	unsigned int ui = u.ui;
+
+	int s = (ui >> 16) & 0x8000; // sign bit moved from bit 31 down to bit 15
+	int em = ui & 0x7fffffff; // exponent and mantissa bits with the sign cleared
+
+	// bias exponent and round to nearest; 112 is relative exponent bias (127-15)
+	int h = (em - (112 << 23) + (1 << 12)) >> 13; // (1 << 12) is half of the 13 mantissa bits dropped by the shift
+
+	// underflow: flush to zero; 113 encodes exponent -14
+	h = (em < (113 << 23)) ? 0 : h;
+
+	// overflow: infinity; 143 encodes exponent 16
+	h = (em >= (143 << 23)) ? 0x7c00 : h;
+
+	// NaN; note that we convert all types of NaN to qNaN
+	h = (em > (255 << 23)) ? 0x7e00 : h; // checked last so it overrides the overflow result for NaN inputs
+
+	return (unsigned short)(s | h);
+}
+
+float meshopt_quantizeFloat(float v, int N) // returns v with the low (23 - N) mantissa bits rounded away; N must be in [0, 23]
+{
+	assert(N >= 0 && N <= 23);
+
+	union { float f; unsigned int ui; } u = {v}; // read the float's bit pattern through the union
+	unsigned int ui = u.ui;
+
+	const int mask = (1 << (23 - N)) - 1; // the low (23 - N) bits that get cleared
+	const int round = (1 << (23 - N)) >> 1; // half of the cleared range, added before masking
+
+	int e = ui & 0x7f800000; // exponent field only
+	unsigned int rui = (ui + round) & ~mask;
+
+	// round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0
+	ui = e == 0x7f800000 ? ui : rui;
+
+	// flush denormals to zero
+	ui = e == 0 ? 0 : ui; // writes all-zero bits, so the sign bit is cleared as well
+
+	u.ui = ui;
+	return u.f;
+}
+
+float meshopt_dequantizeHalf(unsigned short h) // expands the 16-bit pattern h (sign in bit 15) into a 32-bit float
+{
+	unsigned int s = unsigned(h & 0x8000) << 16; // sign bit moved from bit 15 up to bit 31
+	int em = h & 0x7fff; // low 15 bits: exponent and mantissa
+
+	// bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
+	int r = (em + (112 << 10)) << 13;
+
+	// denormal: flush to zero
+	r = (em < (1 << 10)) ? 0 : r; // em below (1 << 10) means the exponent field is zero
+
+	// infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
+	// 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
+	r += (em >= (31 << 10)) ? (112 << 23) : 0;
+
+	union { float f; unsigned int ui; } u; // write the assembled bits and read them back as a float
+	u.ui = s | r;
+	return u.f;
+}
@@ -113,7 +113,7 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
 {
 	using namespace meshopt;

-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	meshopt_Allocator allocator;
@@ -144,7 +144,7 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
 	using namespace meshopt;

 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	(void)vertex_count;
@@ -110,7 +110,7 @@ static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned
 	return ~0u;
 }

-static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
+static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
 {
 	unsigned int best_candidate = ~0u;
 	int best_priority = -1;
@@ -221,9 +221,9 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
 	}

-	unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
+	unsigned int cache_holder[2 * (kCacheSizeMax + 4)];
 	unsigned int* cache = cache_holder;
-	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
+	unsigned int* cache_new = cache_holder + kCacheSizeMax + 4;
 	size_t cache_count = 0;

 	unsigned int current_triangle = 0;
@@ -260,10 +260,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		{
 			unsigned int index = cache[i];

-			if (index != a && index != b && index != c)
-			{
-				cache_new[cache_write++] = index;
-			}
+			cache_new[cache_write] = index;
+			cache_write += (index != a && index != b && index != c);
 		}

 		unsigned int* cache_temp = cache;
@@ -281,16 +279,16 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		{
 			unsigned int index = indices[current_triangle * 3 + k];

-			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
-			size_t neighbours_size = adjacency.counts[index];
+			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbors_size = adjacency.counts[index];

-			for (size_t i = 0; i < neighbours_size; ++i)
+			for (size_t i = 0; i < neighbors_size; ++i)
 			{
-				unsigned int tri = neighbours[i];
+				unsigned int tri = neighbors[i];

 				if (tri == current_triangle)
 				{
-					neighbours[i] = neighbours[neighbours_size - 1];
+					neighbors[i] = neighbors[neighbors_size - 1];
 					adjacency.counts[index]--;
 					break;
 				}
@@ -305,6 +303,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		{
 			unsigned int index = cache[i];

+			// no need to update scores if we are never going to use this vertex
+			if (adjacency.counts[index] == 0)
+				continue;
+
 			int cache_position = i >= cache_size ? -1 : int(i);

 			// update vertex score
@@ -314,10 +316,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 			vertex_scores[index] = score;

 			// update scores of vertex triangles
-			const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
-			const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];
+			const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
+			const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];

-			for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+			for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
 			{
 				unsigned int tri = *it;
 				assert(!emitted_flags[tri]);
@@ -325,11 +327,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 				float tri_score = triangle_scores[tri] + score_diff;
 				assert(tri_score > 0);

-				if (best_score < tri_score)
-				{
-					best_triangle = tri;
-					best_score = tri_score;
-				}
+				best_triangle = best_score < tri_score ? tri : best_triangle;
+				best_score = best_score < tri_score ? tri_score : best_score;

 				triangle_scores[tri] = tri_score;
 			}
@@ -412,11 +411,11 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
 	{
 		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;

-		// emit all vertex neighbours
-		const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
-		const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
+		// emit all vertex neighbors
+		const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
+		const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];

-		for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+		for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
 		{
 			unsigned int triangle = *it;

@@ -461,7 +460,7 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
 		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;

 		// get next vertex
-		current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
+		current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);

 		if (current_vertex == ~0u)
 		{
@@ -44,12 +44,22 @@
 // When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
 #if defined(__wasm_simd128__)
 #define SIMD_WASM
+// Prevent compiling other variant when wasm simd compilation is active
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
 #endif

 #ifndef SIMD_TARGET
 #define SIMD_TARGET
 #endif

+// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
+// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
+#define SIMD_LATENCYOPT
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD

 #ifdef SIMD_SSE
@@ -77,19 +87,17 @@
 #endif

 #ifdef SIMD_WASM
-#undef __DEPRECATED
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
 #include <wasm_simd128.h>
 #endif

 #ifdef SIMD_WASM
-#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
-#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
-#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
-#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
-#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
-#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
-#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
+#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
+#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
+#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
+#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
+#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
 #endif

 namespace meshopt
@@ -212,7 +220,7 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

 	if (size_t(data_end - data) < header_size)
-		return 0;
+		return NULL;

 	data += header_size;

@@ -221,7 +229,7 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
 	{
 		if (size_t(data_end - data) < kByteGroupDecodeLimit)
-			return 0;
+			return NULL;

 		int best_bits = 8;
 		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
@@ -280,7 +288,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data

 		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
 		if (!data)
-			return 0;
+			return NULL;
 	}

 	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
@@ -288,7 +296,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
 	return data;
 }

-#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
+#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
 static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
 {
 #define READ() byte = *data++
@@ -348,14 +356,14 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

 	if (size_t(data_end - data) < header_size)
-		return 0;
+		return NULL;

 	data += header_size;

 	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
 	{
 		if (size_t(data_end - data) < kByteGroupDecodeLimit)
-			return 0;
+			return NULL;

 		size_t header_offset = i / kByteGroupSize;

@@ -380,7 +388,7 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u
 	{
 		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
 		if (!data)
-			return 0;
+			return NULL;

 		size_t vertex_offset = k;

@@ -472,6 +480,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		typedef int unaligned_int;
 #endif

+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));

@@ -490,11 +510,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi

 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}

 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));

@@ -512,7 +546,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi

 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}

 	case 3:
@@ -604,24 +642,13 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8

 static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00; 0xff * magic == 0x0102040810204080, so each 0xff byte k adds (1 << k) to the top byte of the 64-bit product without carries
+	const uint64_t magic = 0x000103070f1f3f80ull;

-	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
-	uint8x16_t masked = vandq_u8(mask, byte_mask);
+	uint64x2_t mask2 = vreinterpretq_u64_u8(mask); // reinterpret the 16 mask bytes as two 64-bit lanes

-#ifdef __aarch64__
-	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
-	mask0 = vaddv_u8(vget_low_u8(masked));
-	mask1 = vaddv_u8(vget_high_u8(masked));
-#else
-	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
-	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
-	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
-	uint8x8_t sum3 = vpadd_u8(sum2, sum2);
-
-	mask0 = vget_lane_u8(sum3, 0);
-	mask1 = vget_lane_u8(sum3, 1);
-#endif
+	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56); // bit k is set when bits [8k, 8k+8) of lane 0 are 0xff
+	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56); // same packing for lane 1
 }

 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
@@ -639,6 +666,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi

 	case 1:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel2 = vld1_u8(data);
 		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
 		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
@@ -655,11 +694,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi

 		vst1q_u8(buffer, result);

+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}

 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel4 = vld1_u8(data);
 		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
 		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
@@ -675,7 +728,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi

 		vst1q_u8(buffer, result);

+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}

 	case 3:
@@ -702,7 +759,7 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);

 	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
-	sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+	sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

 	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);

@@ -715,7 +772,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
 	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
 	const uint64_t magic = 0x000103070f1f3f80ull;

-	// TODO: This can use v8x16_bitmask in the future
 	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
 	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
 }
@@ -723,9 +779,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
 SIMD_TARGET
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
 {
-	unsigned char byte, enc, encv;
-	const unsigned char* data_var;
-
 	switch (bitslog2)
 	{
 	case 0:
@@ -753,7 +806,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi

 		v128_t shuf = decodeShuffleMask(mask0, mask1);

-		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);

 		wasm_v128_store(buffer, result);

@@ -775,7 +828,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi

 		v128_t shuf = decodeShuffleMask(mask0, mask1);

-		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);

 		wasm_v128_store(buffer, result);

@@ -885,7 +938,7 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

 	if (size_t(data_end - data) < header_size)
-		return 0;
+		return NULL;

 	data += header_size;

@@ -907,7 +960,7 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 	for (; i < buffer_size; i += kByteGroupSize)
 	{
 		if (size_t(data_end - data) < kByteGroupDecodeLimit)
-			return 0;
+			return NULL;

 		size_t header_offset = i / kByteGroupSize;

@@ -935,7 +988,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 		{
 			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
 			if (!data)
-				return 0;
+				return NULL;
 		}

 #if defined(SIMD_SSE) || defined(SIMD_AVX)
@@ -1129,7 +1182,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);

-	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;

 #if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
 	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
@@ -30,6 +30,9 @@
 // When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
 #if defined(__wasm_simd128__)
 #define SIMD_WASM
+// Prevent compiling other variant when wasm simd compilation is active
+#undef SIMD_NEON
+#undef SIMD_SSE
 #endif

 #endif // !MESHOPTIMIZER_NO_SIMD
@@ -63,6 +66,10 @@
 #define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
 #endif

+#ifndef __has_builtin
+#define __has_builtin(x) 0 // fallback when the compiler does not provide __has_builtin: every builtin query evaluates to 0
+#endif
+
 namespace meshopt
 {

@@ -185,9 +192,7 @@ inline uint64_t rotateleft64(uint64_t v, int x)
 {
 #if defined(_MSC_VER) && !defined(__clang__)
 	return _rotl64(v, x);
-// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for
-// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
-#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11)
+#elif defined(__clang__) && __has_builtin(__builtin_rotateleft64)
 	return __builtin_rotateleft64(v, x);
 #else
 	return (v << (x & 63)) | (v >> ((64 - x) & 63));
@@ -791,6 +796,33 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 }
 #endif

+// optimized variant of frexp: returns only the binary exponent of v, taken from its bit pattern (assumes float is 32-bit IEEE-754); the sign bit is ignored, an all-zero pattern (+0.0f) returns 0, and a zero exponent field (denormals, -0.0f) returns -126
+inline int optlog2(float v)
+{
+	union
+	{
+		float f;
+		unsigned int ui;
+	} u;
+
+	u.f = v; // write as float, read back as raw bits through the union
+	// +1 accounts for implicit 1. in mantissa; denormalized numbers will end up clamped to min_exp by calling code
+	return u.ui == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1; // bits 23..30 are the 8-bit exponent field; 127 is subtracted as its bias
+}
+
+// optimized variant of ldexp: returns 2^e as a float by writing (e + 127) into the exponent field with zero sign and mantissa bits (assumes float is 32-bit IEEE-754); e is not range-checked, so the result is only 2^e while e + 127 is in [1, 254]
+inline float optexp2(int e)
+{
+	union
+	{
+		float f;
+		unsigned int ui;
+	} u;
+
+	u.ui = unsigned(e + 127) << 23; // the exponent field starts at bit 23; all lower (mantissa) bits stay zero
+	return u.f; // read the assembled bit pattern back as a float
+}
+
 } // namespace meshopt

 void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)
@@ -918,39 +950,78 @@ void meshopt_encodeFilterQuat(void* destination_, size_t count, size_t stride, i
 	}
 }

-void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data)
+void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode)
 {
-	assert(stride > 0 && stride % 4 == 0);
+	using namespace meshopt;
+
+	assert(stride > 0 && stride % 4 == 0 && stride <= 256);
 	assert(bits >= 1 && bits <= 24);

 	unsigned int* destination = static_cast<unsigned int*>(destination_);
 	size_t stride_float = stride / sizeof(float);

+	int component_exp[64];
+	assert(stride_float <= sizeof(component_exp) / sizeof(int));
+
+	const int min_exp = -100;
+
+	if (mode == meshopt_EncodeExpSharedComponent)
+	{
+		for (size_t j = 0; j < stride_float; ++j)
+			component_exp[j] = min_exp;
+
+		for (size_t i = 0; i < count; ++i)
+		{
+			const float* v = &data[i * stride_float];
+
+			// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (component_exp[j] < e) ? e : component_exp[j];
+			}
+		}
+	}
+
 	for (size_t i = 0; i < count; ++i)
 	{
 		const float* v = &data[i * stride_float];
 		unsigned int* d = &destination[i * stride_float];

-		// use maximum exponent to encode values; this guarantess that mantissa is [-1, 1]
-		int exp = -100;
+		int vector_exp = min_exp;

-		for (size_t j = 0; j < stride_float; ++j)
+		if (mode == meshopt_EncodeExpSharedVector)
 		{
-			int e;
-			frexp(v[j], &e);
+			// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);

-			exp = (exp < e) ? e : exp;
+				vector_exp = (vector_exp < e) ? e : vector_exp;
+			}
+		}
+		else if (mode == meshopt_EncodeExpSeparate)
+		{
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (min_exp < e) ? e : min_exp;
+			}
 		}

-		// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
-		exp -= (bits - 1);
-
-		// compute renormalized rounded mantissa for each component
-		int mmask = (1 << 24) - 1;
-
 		for (size_t j = 0; j < stride_float; ++j)
 		{
-			int m = int(ldexp(v[j], -exp) + (v[j] >= 0 ? 0.5f : -0.5f));
+			int exp = (mode == meshopt_EncodeExpSharedVector) ? vector_exp : component_exp[j];
+
+			// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
+			exp -= (bits - 1);
+
+			// compute renormalized rounded mantissa for each component
+			int mmask = (1 << 24) - 1;
+
+			int m = int(v[j] * optexp2(-exp) + (v[j] >= 0 ? 0.5f : -0.5f));

 			d[j] = (m & mmask) | (unsigned(exp) << 24);
 		}