editor updates: male dummy, meshoptimizer, cpp mesh export
This commit is contained in:
+90
-89
@@ -529,102 +529,103 @@ void MeshWindow::Create(EditorComponent* _editor)
|
||||
params.extensions.push_back("h");
|
||||
params.type = wi::helper::FileDialogParams::TYPE::SAVE;
|
||||
wi::helper::FileDialog(params, [=](std::string filename) {
|
||||
|
||||
// Bake transformed and skinned positions:
|
||||
wi::vector<XMFLOAT3> vertices(mesh->vertex_positions.size());
|
||||
const Scene& scene = editor->GetCurrentScene();
|
||||
XMMATRIX M = XMMatrixIdentity();
|
||||
if (editor->componentsWnd.objectWnd.entity != INVALID_ENTITY)
|
||||
{
|
||||
// if first selection is an object then transformation will be also applied
|
||||
Entity object_entity = editor->componentsWnd.objectWnd.entity;
|
||||
const ObjectComponent* object = scene.objects.GetComponent(object_entity);
|
||||
if (object != nullptr)
|
||||
wi::eventhandler::Subscribe_Once(wi::eventhandler::EVENT_THREAD_SAFE_POINT, [=](uint64_t userdata) {
|
||||
// Bake transformed and skinned positions:
|
||||
wi::vector<XMFLOAT3> vertices(mesh->vertex_positions.size());
|
||||
const Scene& scene = editor->GetCurrentScene();
|
||||
XMMATRIX M = XMMatrixIdentity();
|
||||
if (editor->componentsWnd.objectWnd.entity != INVALID_ENTITY)
|
||||
{
|
||||
size_t index = scene.objects.GetIndex(object_entity);
|
||||
M = XMLoadFloat4x4(&scene.matrix_objects[index]);
|
||||
// if first selection is an object then transformation will be also applied
|
||||
Entity object_entity = editor->componentsWnd.objectWnd.entity;
|
||||
const ObjectComponent* object = scene.objects.GetComponent(object_entity);
|
||||
if (object != nullptr)
|
||||
{
|
||||
size_t index = scene.objects.GetIndex(object_entity);
|
||||
M = XMLoadFloat4x4(&scene.matrix_objects[index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
const ArmatureComponent* armature = scene.armatures.GetComponent(mesh->armatureID);
|
||||
for (size_t i = 0; i < mesh->vertex_positions.size(); ++i)
|
||||
{
|
||||
XMVECTOR P;
|
||||
if (armature == nullptr)
|
||||
const ArmatureComponent* armature = scene.armatures.GetComponent(mesh->armatureID);
|
||||
for (size_t i = 0; i < mesh->vertex_positions.size(); ++i)
|
||||
{
|
||||
P = XMLoadFloat3(&mesh->vertex_positions[i]);
|
||||
XMVECTOR P;
|
||||
if (armature == nullptr)
|
||||
{
|
||||
P = XMLoadFloat3(&mesh->vertex_positions[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
P = wi::scene::SkinVertex(*mesh, *armature, (uint32_t)i);
|
||||
}
|
||||
P = XMVector3Transform(P, M);
|
||||
XMStoreFloat3(&vertices[i], P);
|
||||
}
|
||||
|
||||
// Gather all indices for all subsets in LOD0:
|
||||
wi::vector<uint32_t> indices;
|
||||
uint32_t first_subset = 0;
|
||||
uint32_t last_subset = 0;
|
||||
mesh->GetLODSubsetRange(0, first_subset, last_subset);
|
||||
for (uint32_t subsetIndex = first_subset; subsetIndex < last_subset; ++subsetIndex)
|
||||
{
|
||||
const MeshComponent::MeshSubset& subset = mesh->subsets[subsetIndex];
|
||||
if (subset.indexCount == 0)
|
||||
continue;
|
||||
for (uint32_t i = 0; i < subset.indexCount; ++i)
|
||||
{
|
||||
indices.push_back(mesh->indices[subset.indexOffset + i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Generate shadow indices for position-only stream:
|
||||
wi::vector<uint32_t> shadow_indices(indices.size() * 2); // *2 fixes some weird memory oob write issue with a specific model
|
||||
meshopt_generateShadowIndexBuffer(
|
||||
shadow_indices.data(), indices.data(), indices.size(),
|
||||
vertices.data(), vertices.size(), sizeof(XMFLOAT3), sizeof(XMFLOAT3)
|
||||
);
|
||||
|
||||
// De-duplicate vertices based on shadow index buffer:
|
||||
wi::vector<unsigned int> remap(shadow_indices.size());
|
||||
const size_t vertex_count = meshopt_generateVertexRemap(
|
||||
remap.data(),
|
||||
shadow_indices.data(), shadow_indices.size(),
|
||||
vertices.data(), vertices.size(), sizeof(XMFLOAT3)
|
||||
);
|
||||
wi::vector<XMFLOAT3> remapped_vertices(vertex_count);
|
||||
wi::vector<uint32_t> remapped_indices(shadow_indices.size());
|
||||
meshopt_remapIndexBuffer(remapped_indices.data(), shadow_indices.data(), shadow_indices.size(), remap.data());
|
||||
meshopt_remapVertexBuffer(remapped_vertices.data(), vertices.data(), vertices.size() /*initial vertex count, not the one returned from meshopt_generateVertexRemap*/, sizeof(XMFLOAT3), remap.data());
|
||||
|
||||
// Optimizations:
|
||||
meshopt_optimizeVertexCache(remapped_indices.data(), remapped_indices.data(), remapped_indices.size(), vertex_count);
|
||||
meshopt_optimizeVertexFetch(remapped_vertices.data(), remapped_indices.data(), remapped_indices.size(), remapped_vertices.data(), vertex_count, sizeof(XMFLOAT3));
|
||||
|
||||
// Generate C++ header syntax:
|
||||
std::string str;
|
||||
str += "static const float3 vertices[" + std::to_string(remapped_vertices.size()) + "] = {\n";
|
||||
for (auto& pos : remapped_vertices)
|
||||
{
|
||||
str += "\tfloat3(" + std::to_string(pos.x) + "f," + std::to_string(pos.y) + "f," + std::to_string(pos.z) + "f),\n";
|
||||
}
|
||||
str += "};\n";
|
||||
str += "static const unsigned int indices[" + std::to_string(remapped_indices.size()) + "] = {\n";
|
||||
for (size_t i = 0; i < remapped_indices.size(); i += 3)
|
||||
{
|
||||
str += "\t" + std::to_string(remapped_indices[i + 0]) + "," + std::to_string(remapped_indices[i + 1]) + "," + std::to_string(remapped_indices[i + 2]) + ",\n";
|
||||
}
|
||||
str += "};\n";
|
||||
|
||||
// Write to file:
|
||||
std::string filename_dest = wi::helper::ForceExtension(filename, "h");
|
||||
if (wi::helper::FileWrite(filename_dest, (uint8_t*)str.c_str(), str.length()))
|
||||
{
|
||||
editor->PostSaveText("Mesh exported to header file: ", filename_dest);
|
||||
}
|
||||
else
|
||||
{
|
||||
P = wi::scene::SkinVertex(*mesh, *armature, (uint32_t)i);
|
||||
editor->PostSaveText("Failed to write file: ", filename_dest);
|
||||
}
|
||||
P = XMVector3Transform(P, M);
|
||||
XMStoreFloat3(&vertices[i], P);
|
||||
}
|
||||
|
||||
// Gather all indices for all subsets in LOD0:
|
||||
wi::vector<uint32_t> indices;
|
||||
uint32_t first_subset = 0;
|
||||
uint32_t last_subset = 0;
|
||||
mesh->GetLODSubsetRange(0, first_subset, last_subset);
|
||||
for (uint32_t subsetIndex = first_subset; subsetIndex < last_subset; ++subsetIndex)
|
||||
{
|
||||
const MeshComponent::MeshSubset& subset = mesh->subsets[subsetIndex];
|
||||
if (subset.indexCount == 0)
|
||||
continue;
|
||||
for (uint32_t i = 0; i < subset.indexCount; ++i)
|
||||
{
|
||||
indices.push_back(mesh->indices[subset.indexOffset + i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Generate shadow indices for position-only stream:
|
||||
wi::vector<uint32_t> shadow_indices(indices.size());
|
||||
meshopt_generateShadowIndexBuffer(
|
||||
shadow_indices.data(), indices.data(), indices.size(),
|
||||
vertices.data(), vertices.size(), sizeof(XMFLOAT3), sizeof(XMFLOAT3)
|
||||
);
|
||||
|
||||
// De-duplicate vertices based on shadow index buffer:
|
||||
wi::vector<unsigned int> remap(shadow_indices.size());
|
||||
const size_t vertex_count = meshopt_generateVertexRemap(
|
||||
remap.data(),
|
||||
shadow_indices.data(), shadow_indices.size(),
|
||||
vertices.data(), vertices.size(), sizeof(XMFLOAT3)
|
||||
);
|
||||
wi::vector<XMFLOAT3> remapped_vertices(vertex_count);
|
||||
wi::vector<uint32_t> remapped_indices(shadow_indices.size());
|
||||
meshopt_remapIndexBuffer(remapped_indices.data(), shadow_indices.data(), shadow_indices.size(), remap.data());
|
||||
meshopt_remapVertexBuffer(remapped_vertices.data(), vertices.data(), vertices.size() /*initial vertex count, not the one returned from meshopt_generateVertexRemap*/, sizeof(XMFLOAT3), remap.data());
|
||||
|
||||
// Optimizations:
|
||||
meshopt_optimizeVertexCache(remapped_indices.data(), remapped_indices.data(), remapped_indices.size(), vertex_count);
|
||||
meshopt_optimizeVertexFetch(remapped_vertices.data(), remapped_indices.data(), remapped_indices.size(), remapped_vertices.data(), vertex_count, sizeof(XMFLOAT3));
|
||||
|
||||
// Generate C++ header syntax:
|
||||
std::string str;
|
||||
str += "static const float3 vertices[" + std::to_string(remapped_vertices.size()) + "] = {\n";
|
||||
for (auto& pos : remapped_vertices)
|
||||
{
|
||||
str += "\tfloat3(" + std::to_string(pos.x) + "f," + std::to_string(pos.y) + "f," + std::to_string(pos.z) + "f),\n";
|
||||
}
|
||||
str += "};\n";
|
||||
str += "static const unsigned int indices[" + std::to_string(remapped_indices.size()) + "] = {\n";
|
||||
for (size_t i = 0; i < remapped_indices.size(); i += 3)
|
||||
{
|
||||
str += "\t" + std::to_string(remapped_indices[i + 0]) + "," + std::to_string(remapped_indices[i + 1]) + "," + std::to_string(remapped_indices[i + 2]) + ",\n";
|
||||
}
|
||||
str += "};\n";
|
||||
|
||||
// Write to file:
|
||||
filename = wi::helper::ForceExtension(filename, "h");
|
||||
if (wi::helper::FileWrite(filename, (uint8_t*)str.c_str(), str.length()))
|
||||
{
|
||||
editor->PostSaveText("Mesh exported to header file: ", filename);
|
||||
}
|
||||
else
|
||||
{
|
||||
editor->PostSaveText("Failed to write file: ", filename);
|
||||
}
|
||||
});
|
||||
});
|
||||
});
|
||||
AddWidget(&exportHeaderButton);
|
||||
|
||||
+54738
-54825
File diff suppressed because it is too large
Load Diff
@@ -283,6 +283,79 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int
|
||||
return result;
|
||||
}
|
||||
|
||||
static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra)
|
||||
{
|
||||
unsigned int best_triangle = ~0u;
|
||||
unsigned int best_extra = 5;
|
||||
float best_score = FLT_MAX;
|
||||
|
||||
for (size_t i = 0; i < meshlet.vertex_count; ++i)
|
||||
{
|
||||
unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
|
||||
|
||||
unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
|
||||
size_t neighbors_size = adjacency.counts[index];
|
||||
|
||||
for (size_t j = 0; j < neighbors_size; ++j)
|
||||
{
|
||||
unsigned int triangle = neighbors[j];
|
||||
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
|
||||
|
||||
unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
|
||||
|
||||
// triangles that don't add new vertices to meshlets are max. priority
|
||||
if (extra != 0)
|
||||
{
|
||||
// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
|
||||
if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
|
||||
extra = 0;
|
||||
|
||||
extra++;
|
||||
}
|
||||
|
||||
// since topology-based priority is always more important than the score, we can skip scoring in some cases
|
||||
if (extra > best_extra)
|
||||
continue;
|
||||
|
||||
float score = 0;
|
||||
|
||||
// caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles)
|
||||
if (meshlet_cone)
|
||||
{
|
||||
const Cone& tri_cone = triangles[triangle];
|
||||
|
||||
float distance2 =
|
||||
(tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) +
|
||||
(tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) +
|
||||
(tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz);
|
||||
|
||||
float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz;
|
||||
|
||||
score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
|
||||
}
|
||||
else
|
||||
{
|
||||
// each live_triangles entry is >= 1 since it includes the current triangle we're processing
|
||||
score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3);
|
||||
}
|
||||
|
||||
// note that topology-based priority is always more important than the score
|
||||
// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
|
||||
if (extra < best_extra || score < best_score)
|
||||
{
|
||||
best_triangle = triangle;
|
||||
best_extra = extra;
|
||||
best_score = score;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (out_extra)
|
||||
*out_extra = best_extra;
|
||||
|
||||
return best_triangle;
|
||||
}
|
||||
|
||||
struct KDNode
|
||||
{
|
||||
union
|
||||
@@ -464,13 +537,15 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
|
||||
using namespace meshopt;
|
||||
|
||||
assert(index_count % 3 == 0);
|
||||
assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride % sizeof(float) == 0);
|
||||
|
||||
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
|
||||
assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
|
||||
assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
|
||||
|
||||
assert(cone_weight >= 0 && cone_weight <= 1);
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
|
||||
TriangleAdjacency2 adjacency = {};
|
||||
@@ -511,65 +586,18 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
|
||||
|
||||
for (;;)
|
||||
{
|
||||
unsigned int best_triangle = ~0u;
|
||||
unsigned int best_extra = 5;
|
||||
float best_score = FLT_MAX;
|
||||
|
||||
Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);
|
||||
|
||||
for (size_t i = 0; i < meshlet.vertex_count; ++i)
|
||||
unsigned int best_extra = 0;
|
||||
unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra);
|
||||
|
||||
// if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
|
||||
if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
|
||||
{
|
||||
unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
|
||||
|
||||
unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
|
||||
size_t neighbours_size = adjacency.counts[index];
|
||||
|
||||
for (size_t j = 0; j < neighbours_size; ++j)
|
||||
{
|
||||
unsigned int triangle = neighbours[j];
|
||||
assert(!emitted_flags[triangle]);
|
||||
|
||||
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
|
||||
assert(a < vertex_count && b < vertex_count && c < vertex_count);
|
||||
|
||||
unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
|
||||
|
||||
// triangles that don't add new vertices to meshlets are max. priority
|
||||
if (extra != 0)
|
||||
{
|
||||
// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
|
||||
if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
|
||||
extra = 0;
|
||||
|
||||
extra++;
|
||||
}
|
||||
|
||||
// since topology-based priority is always more important than the score, we can skip scoring in some cases
|
||||
if (extra > best_extra)
|
||||
continue;
|
||||
|
||||
const Cone& tri_cone = triangles[triangle];
|
||||
|
||||
float distance2 =
|
||||
(tri_cone.px - meshlet_cone.px) * (tri_cone.px - meshlet_cone.px) +
|
||||
(tri_cone.py - meshlet_cone.py) * (tri_cone.py - meshlet_cone.py) +
|
||||
(tri_cone.pz - meshlet_cone.pz) * (tri_cone.pz - meshlet_cone.pz);
|
||||
|
||||
float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
|
||||
|
||||
float score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
|
||||
|
||||
// note that topology-based priority is always more important than the score
|
||||
// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
|
||||
if (extra < best_extra || score < best_score)
|
||||
{
|
||||
best_triangle = triangle;
|
||||
best_extra = extra;
|
||||
best_score = score;
|
||||
}
|
||||
}
|
||||
best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL);
|
||||
}
|
||||
|
||||
// when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
|
||||
if (best_triangle == ~0u)
|
||||
{
|
||||
float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
|
||||
@@ -604,16 +632,16 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
|
||||
{
|
||||
unsigned int index = indices[best_triangle * 3 + k];
|
||||
|
||||
unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
|
||||
size_t neighbours_size = adjacency.counts[index];
|
||||
unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
|
||||
size_t neighbors_size = adjacency.counts[index];
|
||||
|
||||
for (size_t i = 0; i < neighbours_size; ++i)
|
||||
for (size_t i = 0; i < neighbors_size; ++i)
|
||||
{
|
||||
unsigned int tri = neighbours[i];
|
||||
unsigned int tri = neighbors[i];
|
||||
|
||||
if (tri == best_triangle)
|
||||
{
|
||||
neighbours[i] = neighbours[neighbours_size - 1];
|
||||
neighbors[i] = neighbors[neighbors_size - 1];
|
||||
adjacency.counts[index]--;
|
||||
break;
|
||||
}
|
||||
@@ -687,7 +715,7 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
|
||||
|
||||
assert(index_count % 3 == 0);
|
||||
assert(index_count / 3 <= kMeshletMaxTriangles);
|
||||
assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride % sizeof(float) == 0);
|
||||
|
||||
(void)vertex_count;
|
||||
@@ -839,7 +867,7 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices
|
||||
using namespace meshopt;
|
||||
|
||||
assert(triangle_count <= kMeshletMaxTriangles);
|
||||
assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride % sizeof(float) == 0);
|
||||
|
||||
unsigned int indices[kMeshletMaxTriangles * 3];
|
||||
|
||||
@@ -13,7 +13,7 @@ namespace meshopt
|
||||
const unsigned char kIndexHeader = 0xe0;
|
||||
const unsigned char kSequenceHeader = 0xd0;
|
||||
|
||||
static int gEncodeIndexVersion = 0;
|
||||
static int gEncodeIndexVersion = 1;
|
||||
|
||||
typedef unsigned int VertexFifo[16];
|
||||
typedef unsigned int EdgeFifo[16][2];
|
||||
|
||||
@@ -157,7 +157,7 @@ static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, c
|
||||
}
|
||||
|
||||
assert(false && "Hash table is full"); // unreachable
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
|
||||
@@ -178,6 +178,22 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position
|
||||
|
||||
remap[index] = *entry;
|
||||
}
|
||||
|
||||
allocator.deallocate(vertex_table);
|
||||
}
|
||||
|
||||
// Scatters vertices into `destination` according to `remap`: source vertex i is copied to slot
// remap[i], and entries equal to ~0u are skipped.
//
// BlockSize is the per-vertex byte count when it is known at compile time; BlockSize == 0 means
// "use the runtime vertex_size". In either case the effective size must equal vertex_size.
template <size_t BlockSize>
static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
{
	const size_t stride = BlockSize != 0 ? BlockSize : vertex_size;
	assert(stride == vertex_size);

	unsigned char* out = static_cast<unsigned char*>(destination);
	const unsigned char* in = static_cast<const unsigned char*>(vertices);

	for (size_t source = 0; source < vertex_count; ++source)
	{
		const unsigned int target = remap[source];

		// ~0u marks a vertex that has no slot in the output
		if (target == ~0u)
			continue;

		assert(target < vertex_count);
		memcpy(out + target * stride, in + source * stride, stride);
	}
}
|
||||
|
||||
} // namespace meshopt
|
||||
@@ -187,7 +203,7 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int
|
||||
using namespace meshopt;
|
||||
|
||||
assert(indices || index_count == vertex_count);
|
||||
assert(index_count % 3 == 0);
|
||||
assert(!indices || index_count % 3 == 0);
|
||||
assert(vertex_size > 0 && vertex_size <= 256);
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
@@ -288,6 +304,8 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne
|
||||
|
||||
void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
|
||||
{
|
||||
using namespace meshopt;
|
||||
|
||||
assert(vertex_size > 0 && vertex_size <= 256);
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
@@ -300,14 +318,23 @@ void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t v
|
||||
vertices = vertices_copy;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < vertex_count; ++i)
|
||||
// specialize the loop for common vertex sizes to ensure memcpy is compiled as an inlined intrinsic
|
||||
switch (vertex_size)
|
||||
{
|
||||
if (remap[i] != ~0u)
|
||||
{
|
||||
assert(remap[i] < vertex_count);
|
||||
case 4:
|
||||
return remapVertices<4>(destination, vertices, vertex_count, vertex_size, remap);
|
||||
|
||||
memcpy(static_cast<unsigned char*>(destination) + remap[i] * vertex_size, static_cast<const unsigned char*>(vertices) + i * vertex_size, vertex_size);
|
||||
}
|
||||
case 8:
|
||||
return remapVertices<8>(destination, vertices, vertex_count, vertex_size, remap);
|
||||
|
||||
case 12:
|
||||
return remapVertices<12>(destination, vertices, vertex_count, vertex_size, remap);
|
||||
|
||||
case 16:
|
||||
return remapVertices<16>(destination, vertices, vertex_count, vertex_size, remap);
|
||||
|
||||
default:
|
||||
return remapVertices<0>(destination, vertices, vertex_count, vertex_size, remap);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -412,7 +439,7 @@ void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsig
|
||||
using namespace meshopt;
|
||||
|
||||
assert(index_count % 3 == 0);
|
||||
assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride % sizeof(float) == 0);
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
@@ -483,7 +510,7 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un
|
||||
using namespace meshopt;
|
||||
|
||||
assert(index_count % 3 == 0);
|
||||
assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride % sizeof(float) == 0);
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -147,7 +147,7 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
|
||||
using namespace meshopt;
|
||||
|
||||
assert(index_count % 3 == 0);
|
||||
assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride % sizeof(float) == 0);
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
|
||||
@@ -272,7 +272,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
|
||||
using namespace meshopt;
|
||||
|
||||
assert(index_count % 3 == 0);
|
||||
assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride % sizeof(float) == 0);
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"

#include <assert.h>
#include <string.h>
|
||||
|
||||
// Converts a 32-bit float to a 16-bit half-precision bit pattern, rounding to nearest.
// Values too small for a normalized half are flushed to (signed) zero, values too large become
// infinity, and every NaN is converted to a quiet NaN. The sign bit is always preserved.
unsigned short meshopt_quantizeHalf(float v)
{
	// fetch the raw bit pattern with memcpy; reading the inactive member of a union is undefined behavior in C++
	unsigned int ui;
	memcpy(&ui, &v, sizeof(ui));

	int s = (ui >> 16) & 0x8000;
	int em = ui & 0x7fffffff;

	// bias exponent and round to nearest; 112 is relative exponent bias (127-15)
	int h = (em - (112 << 23) + (1 << 12)) >> 13;

	// underflow: flush to zero; 113 encodes exponent -14
	h = (em < (113 << 23)) ? 0 : h;

	// overflow: infinity; 143 encodes exponent 16
	h = (em >= (143 << 23)) ? 0x7c00 : h;

	// NaN; note that we convert all types of NaN to qNaN
	h = (em > (255 << 23)) ? 0x7e00 : h;

	return (unsigned short)(s | h);
}
|
||||
|
||||
// Rounds a 32-bit float so that only the top N bits of its 23-bit mantissa are kept (0 <= N <= 23).
// Infinity and NaN are returned unchanged; inputs with a zero exponent field (zero and denormals)
// are returned as +0.
float meshopt_quantizeFloat(float v, int N)
{
	assert(N >= 0 && N <= 23);

	// fetch the raw bit pattern with memcpy; reading the inactive member of a union is undefined behavior in C++
	unsigned int ui;
	memcpy(&ui, &v, sizeof(ui));

	const int mask = (1 << (23 - N)) - 1;
	const int round = (1 << (23 - N)) >> 1;

	int e = ui & 0x7f800000;
	unsigned int rui = (ui + round) & ~mask;

	// round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0
	ui = e == 0x7f800000 ? ui : rui;

	// flush denormals to zero
	ui = e == 0 ? 0 : ui;

	float result;
	memcpy(&result, &ui, sizeof(result));
	return result;
}
|
||||
|
||||
// Expands a 16-bit half-precision bit pattern into a 32-bit float.
// Half denormals are flushed to (signed) zero; infinity and NaN map to float infinity/NaN, with the
// NaN payload preserved.
float meshopt_dequantizeHalf(unsigned short h)
{
	unsigned int s = unsigned(h & 0x8000) << 16;
	int em = h & 0x7fff;

	// bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
	int r = (em + (112 << 10)) << 13;

	// denormal: flush to zero
	r = (em < (1 << 10)) ? 0 : r;

	// infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
	// 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
	r += (em >= (31 << 10)) ? (112 << 23) : 0;

	// reinterpret the assembled bits as a float with memcpy; reading the inactive member of a union is undefined behavior in C++
	unsigned int bits = s | r;
	float result;
	memcpy(&result, &bits, sizeof(result));
	return result;
}
|
||||
+392
-147
File diff suppressed because it is too large
Load Diff
@@ -113,7 +113,7 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
|
||||
{
|
||||
using namespace meshopt;
|
||||
|
||||
assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride % sizeof(float) == 0);
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
@@ -144,7 +144,7 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
|
||||
using namespace meshopt;
|
||||
|
||||
assert(index_count % 3 == 0);
|
||||
assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
|
||||
assert(vertex_positions_stride % sizeof(float) == 0);
|
||||
|
||||
(void)vertex_count;
|
||||
|
||||
@@ -110,7 +110,7 @@ static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned
|
||||
return ~0u;
|
||||
}
|
||||
|
||||
static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
|
||||
static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
|
||||
{
|
||||
unsigned int best_candidate = ~0u;
|
||||
int best_priority = -1;
|
||||
@@ -221,9 +221,9 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
|
||||
triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
|
||||
}
|
||||
|
||||
unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
|
||||
unsigned int cache_holder[2 * (kCacheSizeMax + 4)];
|
||||
unsigned int* cache = cache_holder;
|
||||
unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
|
||||
unsigned int* cache_new = cache_holder + kCacheSizeMax + 4;
|
||||
size_t cache_count = 0;
|
||||
|
||||
unsigned int current_triangle = 0;
|
||||
@@ -260,10 +260,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
|
||||
{
|
||||
unsigned int index = cache[i];
|
||||
|
||||
if (index != a && index != b && index != c)
|
||||
{
|
||||
cache_new[cache_write++] = index;
|
||||
}
|
||||
cache_new[cache_write] = index;
|
||||
cache_write += (index != a && index != b && index != c);
|
||||
}
|
||||
|
||||
unsigned int* cache_temp = cache;
|
||||
@@ -281,16 +279,16 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
|
||||
{
|
||||
unsigned int index = indices[current_triangle * 3 + k];
|
||||
|
||||
unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
|
||||
size_t neighbours_size = adjacency.counts[index];
|
||||
unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
|
||||
size_t neighbors_size = adjacency.counts[index];
|
||||
|
||||
for (size_t i = 0; i < neighbours_size; ++i)
|
||||
for (size_t i = 0; i < neighbors_size; ++i)
|
||||
{
|
||||
unsigned int tri = neighbours[i];
|
||||
unsigned int tri = neighbors[i];
|
||||
|
||||
if (tri == current_triangle)
|
||||
{
|
||||
neighbours[i] = neighbours[neighbours_size - 1];
|
||||
neighbors[i] = neighbors[neighbors_size - 1];
|
||||
adjacency.counts[index]--;
|
||||
break;
|
||||
}
|
||||
@@ -305,6 +303,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
|
||||
{
|
||||
unsigned int index = cache[i];
|
||||
|
||||
// no need to update scores if we are never going to use this vertex
|
||||
if (adjacency.counts[index] == 0)
|
||||
continue;
|
||||
|
||||
int cache_position = i >= cache_size ? -1 : int(i);
|
||||
|
||||
// update vertex score
|
||||
@@ -314,10 +316,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
|
||||
vertex_scores[index] = score;
|
||||
|
||||
// update scores of vertex triangles
|
||||
const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
|
||||
const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];
|
||||
const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
|
||||
const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];
|
||||
|
||||
for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
|
||||
for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
|
||||
{
|
||||
unsigned int tri = *it;
|
||||
assert(!emitted_flags[tri]);
|
||||
@@ -325,11 +327,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
|
||||
float tri_score = triangle_scores[tri] + score_diff;
|
||||
assert(tri_score > 0);
|
||||
|
||||
if (best_score < tri_score)
|
||||
{
|
||||
best_triangle = tri;
|
||||
best_score = tri_score;
|
||||
}
|
||||
best_triangle = best_score < tri_score ? tri : best_triangle;
|
||||
best_score = best_score < tri_score ? tri_score : best_score;
|
||||
|
||||
triangle_scores[tri] = tri_score;
|
||||
}
|
||||
@@ -412,11 +411,11 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
|
||||
{
|
||||
const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
|
||||
|
||||
// emit all vertex neighbours
|
||||
const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
|
||||
const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
|
||||
// emit all vertex neighbors
|
||||
const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
|
||||
const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];
|
||||
|
||||
for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
|
||||
for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
|
||||
{
|
||||
unsigned int triangle = *it;
|
||||
|
||||
@@ -461,7 +460,7 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
|
||||
const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
|
||||
|
||||
// get next vertex
|
||||
current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
|
||||
current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
|
||||
|
||||
if (current_vertex == ~0u)
|
||||
{
|
||||
|
||||
@@ -44,12 +44,22 @@
|
||||
// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
|
||||
#if defined(__wasm_simd128__)
|
||||
#define SIMD_WASM
|
||||
// Prevent compiling other variant when wasm simd compilation is active
|
||||
#undef SIMD_NEON
|
||||
#undef SIMD_SSE
|
||||
#undef SIMD_AVX
|
||||
#endif
|
||||
|
||||
#ifndef SIMD_TARGET
|
||||
#define SIMD_TARGET
|
||||
#endif
|
||||
|
||||
// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
|
||||
// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
|
||||
#define SIMD_LATENCYOPT
|
||||
#endif
|
||||
|
||||
#endif // !MESHOPTIMIZER_NO_SIMD
|
||||
|
||||
#ifdef SIMD_SSE
|
||||
@@ -77,19 +87,17 @@
|
||||
#endif
|
||||
|
||||
#ifdef SIMD_WASM
|
||||
#undef __DEPRECATED
|
||||
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#include <wasm_simd128.h>
|
||||
#endif
|
||||
|
||||
#ifdef SIMD_WASM
|
||||
#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
|
||||
#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
|
||||
#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
|
||||
#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
|
||||
#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
|
||||
#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
|
||||
#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
|
||||
#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
|
||||
#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
|
||||
#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
|
||||
#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
|
||||
#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
|
||||
#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
|
||||
#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
|
||||
#endif
|
||||
|
||||
namespace meshopt
|
||||
@@ -212,7 +220,7 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
|
||||
size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
|
||||
|
||||
if (size_t(data_end - data) < header_size)
|
||||
return 0;
|
||||
return NULL;
|
||||
|
||||
data += header_size;
|
||||
|
||||
@@ -221,7 +229,7 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
|
||||
for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
|
||||
{
|
||||
if (size_t(data_end - data) < kByteGroupDecodeLimit)
|
||||
return 0;
|
||||
return NULL;
|
||||
|
||||
int best_bits = 8;
|
||||
size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
|
||||
@@ -280,7 +288,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
|
||||
|
||||
data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
|
||||
if (!data)
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
|
||||
@@ -288,7 +296,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
|
||||
return data;
|
||||
}
|
||||
|
||||
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
|
||||
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
|
||||
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
|
||||
{
|
||||
#define READ() byte = *data++
|
||||
@@ -348,14 +356,14 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne
|
||||
size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
|
||||
|
||||
if (size_t(data_end - data) < header_size)
|
||||
return 0;
|
||||
return NULL;
|
||||
|
||||
data += header_size;
|
||||
|
||||
for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
|
||||
{
|
||||
if (size_t(data_end - data) < kByteGroupDecodeLimit)
|
||||
return 0;
|
||||
return NULL;
|
||||
|
||||
size_t header_offset = i / kByteGroupSize;
|
||||
|
||||
@@ -380,7 +388,7 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u
|
||||
{
|
||||
data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
|
||||
if (!data)
|
||||
return 0;
|
||||
return NULL;
|
||||
|
||||
size_t vertex_offset = k;
|
||||
|
||||
@@ -472,6 +480,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
|
||||
typedef int unaligned_int;
|
||||
#endif
|
||||
|
||||
#ifdef SIMD_LATENCYOPT
|
||||
unsigned int data32;
|
||||
memcpy(&data32, data, 4);
|
||||
data32 &= data32 >> 1;
|
||||
|
||||
// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
|
||||
unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
|
||||
|
||||
// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
|
||||
int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
|
||||
#endif
|
||||
|
||||
__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
|
||||
__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
|
||||
|
||||
@@ -490,11 +510,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
|
||||
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
|
||||
|
||||
#ifdef SIMD_LATENCYOPT
|
||||
return data + 4 + datacnt;
|
||||
#else
|
||||
return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
|
||||
#endif
|
||||
}
|
||||
|
||||
case 2:
|
||||
{
|
||||
#ifdef SIMD_LATENCYOPT
|
||||
unsigned long long data64;
|
||||
memcpy(&data64, data, 8);
|
||||
data64 &= data64 >> 1;
|
||||
data64 &= data64 >> 2;
|
||||
|
||||
// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
|
||||
int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
|
||||
#endif
|
||||
|
||||
__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
|
||||
__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
|
||||
|
||||
@@ -512,7 +546,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
|
||||
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
|
||||
|
||||
#ifdef SIMD_LATENCYOPT
|
||||
return data + 8 + datacnt;
|
||||
#else
|
||||
return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
|
||||
#endif
|
||||
}
|
||||
|
||||
case 3:
|
||||
@@ -604,24 +642,13 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
|
||||
|
||||
static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
|
||||
{
|
||||
static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
|
||||
// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
|
||||
const uint64_t magic = 0x000103070f1f3f80ull;
|
||||
|
||||
uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
|
||||
uint8x16_t masked = vandq_u8(mask, byte_mask);
|
||||
uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
|
||||
|
||||
#ifdef __aarch64__
|
||||
// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
|
||||
mask0 = vaddv_u8(vget_low_u8(masked));
|
||||
mask1 = vaddv_u8(vget_high_u8(masked));
|
||||
#else
|
||||
// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
|
||||
uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
|
||||
uint8x8_t sum2 = vpadd_u8(sum1, sum1);
|
||||
uint8x8_t sum3 = vpadd_u8(sum2, sum2);
|
||||
|
||||
mask0 = vget_lane_u8(sum3, 0);
|
||||
mask1 = vget_lane_u8(sum3, 1);
|
||||
#endif
|
||||
mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
|
||||
mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
|
||||
}
|
||||
|
||||
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
|
||||
@@ -639,6 +666,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
|
||||
|
||||
case 1:
|
||||
{
|
||||
#ifdef SIMD_LATENCYOPT
|
||||
unsigned int data32;
|
||||
memcpy(&data32, data, 4);
|
||||
data32 &= data32 >> 1;
|
||||
|
||||
// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
|
||||
unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
|
||||
|
||||
// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
|
||||
int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
|
||||
#endif
|
||||
|
||||
uint8x8_t sel2 = vld1_u8(data);
|
||||
uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
|
||||
uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
|
||||
@@ -655,11 +694,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
|
||||
|
||||
vst1q_u8(buffer, result);
|
||||
|
||||
#ifdef SIMD_LATENCYOPT
|
||||
return data + 4 + datacnt;
|
||||
#else
|
||||
return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
|
||||
#endif
|
||||
}
|
||||
|
||||
case 2:
|
||||
{
|
||||
#ifdef SIMD_LATENCYOPT
|
||||
unsigned long long data64;
|
||||
memcpy(&data64, data, 8);
|
||||
data64 &= data64 >> 1;
|
||||
data64 &= data64 >> 2;
|
||||
|
||||
// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
|
||||
int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
|
||||
#endif
|
||||
|
||||
uint8x8_t sel4 = vld1_u8(data);
|
||||
uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
|
||||
uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
|
||||
@@ -675,7 +728,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
|
||||
|
||||
vst1q_u8(buffer, result);
|
||||
|
||||
#ifdef SIMD_LATENCYOPT
|
||||
return data + 8 + datacnt;
|
||||
#else
|
||||
return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
|
||||
#endif
|
||||
}
|
||||
|
||||
case 3:
|
||||
@@ -702,7 +759,7 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
|
||||
v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
|
||||
|
||||
v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
|
||||
sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
|
||||
v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
|
||||
|
||||
@@ -715,7 +772,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
|
||||
// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
|
||||
const uint64_t magic = 0x000103070f1f3f80ull;
|
||||
|
||||
// TODO: This can use v8x16_bitmask in the future
|
||||
mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
|
||||
mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
|
||||
}
|
||||
@@ -723,9 +779,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
|
||||
SIMD_TARGET
|
||||
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
|
||||
{
|
||||
unsigned char byte, enc, encv;
|
||||
const unsigned char* data_var;
|
||||
|
||||
switch (bitslog2)
|
||||
{
|
||||
case 0:
|
||||
@@ -753,7 +806,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
|
||||
|
||||
v128_t shuf = decodeShuffleMask(mask0, mask1);
|
||||
|
||||
v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
|
||||
v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
|
||||
|
||||
wasm_v128_store(buffer, result);
|
||||
|
||||
@@ -775,7 +828,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
|
||||
|
||||
v128_t shuf = decodeShuffleMask(mask0, mask1);
|
||||
|
||||
v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
|
||||
v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
|
||||
|
||||
wasm_v128_store(buffer, result);
|
||||
|
||||
@@ -885,7 +938,7 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
|
||||
size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
|
||||
|
||||
if (size_t(data_end - data) < header_size)
|
||||
return 0;
|
||||
return NULL;
|
||||
|
||||
data += header_size;
|
||||
|
||||
@@ -907,7 +960,7 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
|
||||
for (; i < buffer_size; i += kByteGroupSize)
|
||||
{
|
||||
if (size_t(data_end - data) < kByteGroupDecodeLimit)
|
||||
return 0;
|
||||
return NULL;
|
||||
|
||||
size_t header_offset = i / kByteGroupSize;
|
||||
|
||||
@@ -935,7 +988,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
|
||||
{
|
||||
data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
|
||||
if (!data)
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if defined(SIMD_SSE) || defined(SIMD_AVX)
|
||||
@@ -1129,7 +1182,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
|
||||
assert(vertex_size > 0 && vertex_size <= 256);
|
||||
assert(vertex_size % 4 == 0);
|
||||
|
||||
const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;
|
||||
const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
|
||||
|
||||
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
|
||||
decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
|
||||
|
||||
@@ -30,6 +30,9 @@
|
||||
// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
|
||||
#if defined(__wasm_simd128__)
|
||||
#define SIMD_WASM
|
||||
// Prevent compiling other variant when wasm simd compilation is active
|
||||
#undef SIMD_NEON
|
||||
#undef SIMD_SSE
|
||||
#endif
|
||||
|
||||
#endif // !MESHOPTIMIZER_NO_SIMD
|
||||
@@ -63,6 +66,10 @@
|
||||
#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
|
||||
#endif
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
namespace meshopt
|
||||
{
|
||||
|
||||
@@ -185,9 +192,7 @@ inline uint64_t rotateleft64(uint64_t v, int x)
|
||||
{
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
return _rotl64(v, x);
|
||||
// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for
|
||||
// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
|
||||
#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11)
|
||||
#elif defined(__clang__) && __has_builtin(__builtin_rotateleft64)
|
||||
return __builtin_rotateleft64(v, x);
|
||||
#else
|
||||
return (v << (x & 63)) | (v >> ((64 - x) & 63));
|
||||
@@ -791,6 +796,33 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
|
||||
}
|
||||
#endif
|
||||
|
||||
// optimized variant of frexp
|
||||
inline int optlog2(float v)
|
||||
{
|
||||
union
|
||||
{
|
||||
float f;
|
||||
unsigned int ui;
|
||||
} u;
|
||||
|
||||
u.f = v;
|
||||
// +1 accounts for implicit 1. in mantissa; denormalized numbers will end up clamped to min_exp by calling code
|
||||
return u.ui == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
|
||||
}
|
||||
|
||||
// optimized variant of ldexp
|
||||
inline float optexp2(int e)
|
||||
{
|
||||
union
|
||||
{
|
||||
float f;
|
||||
unsigned int ui;
|
||||
} u;
|
||||
|
||||
u.ui = unsigned(e + 127) << 23;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
} // namespace meshopt
|
||||
|
||||
void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)
|
||||
@@ -918,39 +950,78 @@ void meshopt_encodeFilterQuat(void* destination_, size_t count, size_t stride, i
|
||||
}
|
||||
}
|
||||
|
||||
void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data)
|
||||
void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode)
|
||||
{
|
||||
assert(stride > 0 && stride % 4 == 0);
|
||||
using namespace meshopt;
|
||||
|
||||
assert(stride > 0 && stride % 4 == 0 && stride <= 256);
|
||||
assert(bits >= 1 && bits <= 24);
|
||||
|
||||
unsigned int* destination = static_cast<unsigned int*>(destination_);
|
||||
size_t stride_float = stride / sizeof(float);
|
||||
|
||||
int component_exp[64];
|
||||
assert(stride_float <= sizeof(component_exp) / sizeof(int));
|
||||
|
||||
const int min_exp = -100;
|
||||
|
||||
if (mode == meshopt_EncodeExpSharedComponent)
|
||||
{
|
||||
for (size_t j = 0; j < stride_float; ++j)
|
||||
component_exp[j] = min_exp;
|
||||
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
const float* v = &data[i * stride_float];
|
||||
|
||||
// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
|
||||
for (size_t j = 0; j < stride_float; ++j)
|
||||
{
|
||||
int e = optlog2(v[j]);
|
||||
|
||||
component_exp[j] = (component_exp[j] < e) ? e : component_exp[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
const float* v = &data[i * stride_float];
|
||||
unsigned int* d = &destination[i * stride_float];
|
||||
|
||||
// use maximum exponent to encode values; this guarantess that mantissa is [-1, 1]
|
||||
int exp = -100;
|
||||
int vector_exp = min_exp;
|
||||
|
||||
for (size_t j = 0; j < stride_float; ++j)
|
||||
if (mode == meshopt_EncodeExpSharedVector)
|
||||
{
|
||||
int e;
|
||||
frexp(v[j], &e);
|
||||
// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
|
||||
for (size_t j = 0; j < stride_float; ++j)
|
||||
{
|
||||
int e = optlog2(v[j]);
|
||||
|
||||
exp = (exp < e) ? e : exp;
|
||||
vector_exp = (vector_exp < e) ? e : vector_exp;
|
||||
}
|
||||
}
|
||||
else if (mode == meshopt_EncodeExpSeparate)
|
||||
{
|
||||
for (size_t j = 0; j < stride_float; ++j)
|
||||
{
|
||||
int e = optlog2(v[j]);
|
||||
|
||||
component_exp[j] = (min_exp < e) ? e : min_exp;
|
||||
}
|
||||
}
|
||||
|
||||
// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
|
||||
exp -= (bits - 1);
|
||||
|
||||
// compute renormalized rounded mantissa for each component
|
||||
int mmask = (1 << 24) - 1;
|
||||
|
||||
for (size_t j = 0; j < stride_float; ++j)
|
||||
{
|
||||
int m = int(ldexp(v[j], -exp) + (v[j] >= 0 ? 0.5f : -0.5f));
|
||||
int exp = (mode == meshopt_EncodeExpSharedVector) ? vector_exp : component_exp[j];
|
||||
|
||||
// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
|
||||
exp -= (bits - 1);
|
||||
|
||||
// compute renormalized rounded mantissa for each component
|
||||
int mmask = (1 << 24) - 1;
|
||||
|
||||
int m = int(v[j] * optexp2(-exp) + (v[j] >= 0 ? 0.5f : -0.5f));
|
||||
|
||||
d[j] = (m & mmask) | (unsigned(exp) << 24);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user