#ifndef RASTERIZER_CANVAS_BATCHER_H #define RASTERIZER_CANVAS_BATCHER_H /* rasterizer_canvas_batcher.h */ #include "core/config/project_settings.h" #include "core/os/os.h" #include "rasterizer_array.h" #include "rasterizer_asserts.h" #include "rasterizer_storage_common.h" #include "servers/rendering/rasterizer.h" #include "servers/rendering/rendering_server_canvas_helper.h" // We are using the curiously recurring template pattern // https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern // For static polymorphism. // This makes it super easy to access // data / call funcs in the derived rasterizers from the base without writing and // maintaining a boatload of virtual functions. // In addition it assures that vtable will not be used and the function calls can be optimized, // because it gives compile time static polymorphism. // These macros makes it simpler and less verbose to define (and redefine) the inline functions // template preamble #define T_PREAMBLE template // class preamble #define C_PREAMBLE RasterizerCanvasBatcher // generic preamble #define PREAMBLE(RET_T) \ T_PREAMBLE \ RET_T C_PREAMBLE template class RasterizerCanvasBatcher { public: // used to determine whether we use hardware transform (none) // software transform all verts, or software transform just a translate // (no rotate or scale) enum TransformMode { TM_NONE, TM_ALL, TM_TRANSLATE, }; // pod versions of vector and color and RID, need to be 32 bit for vertex format struct BatchVector2 { float x, y; void set(float xx, float yy) { x = xx; y = yy; } void set(const Vector2 &p_o) { x = p_o.x; y = p_o.y; } void to(Vector2 &r_o) const { r_o.x = x; r_o.y = y; } }; struct BatchColor { float r, g, b, a; void set_white() { r = 1.0f; g = 1.0f; b = 1.0f; a = 1.0f; } void set(const Color &p_c) { r = p_c.r; g = p_c.g; b = p_c.b; a = p_c.a; } void set(float rr, float gg, float bb, float aa) { r = rr; g = gg; b = bb; a = aa; } bool operator==(const BatchColor &p_c) const { return (r == p_c.r) && 
(g == p_c.g) && (b == p_c.b) && (a == p_c.a); } bool operator!=(const BatchColor &p_c) const { return (*this == p_c) == false; } bool equals(const Color &p_c) const { return (r == p_c.r) && (g == p_c.g) && (b == p_c.b) && (a == p_c.a); } const float *get_data() const { return &r; } String to_string() const { String sz = "{"; const float *data = get_data(); for (int c = 0; c < 4; c++) { float f = data[c]; int val = ((f * 255.0f) + 0.5f); sz += String(Variant(val)) + " "; } sz += "}"; return sz; } }; // simplest FVF - local or baked position struct BatchVertex { // must be 32 bit pod BatchVector2 pos; BatchVector2 uv; }; // simple FVF but also incorporating baked color struct BatchVertexColored : public BatchVertex { // must be 32 bit pod BatchColor col; }; // if we are using normal mapping, we need light angles to be sent struct BatchVertexLightAngled : public BatchVertexColored { // must be pod float light_angle; }; // CUSTOM SHADER vertex formats. These are larger but will probably // be needed with custom shaders in order to have the data accessible in the shader. // if we are using COLOR in vertex shader but not position (VERTEX) struct BatchVertexModulated : public BatchVertexLightAngled { BatchColor modulate; }; struct BatchTransform { BatchVector2 translate; BatchVector2 basis[2]; }; // last resort, specially for custom shader, we put everything possible into a huge FVF // not very efficient, but better than no batching at all. struct BatchVertexLarge : public BatchVertexModulated { // must be pod BatchTransform transform; }; // Batch should be as small as possible, and ideally nicely aligned (is 32 bytes at the moment) struct Batch { RasterizerStorageCommon::BatchType type; // should be 16 bit uint16_t batch_texture_id; // also item reference number uint32_t first_command; // in the case of DEFAULT, this is num commands. // with rects, is number of command and rects. 
// with lines, is number of lines // with polys, is number of indices (actual rendered verts) uint32_t num_commands; // first vertex of this batch in the vertex lists uint32_t first_vert; // we can keep the batch structure small because we either need to store // the color if a handled batch, or the parent item if a default batch, so // we can reference the correct originating command union { BatchColor color; // for default batches we will store the parent item const RasterizerCanvas::Item *item; }; uint32_t get_num_verts() const { switch (type) { default: { } break; case RasterizerStorageCommon::BT_RECT: { return num_commands * 4; } break; case RasterizerStorageCommon::BT_LINE: { return num_commands * 2; } break; case RasterizerStorageCommon::BT_LINE_AA: { return num_commands * 2; } break; case RasterizerStorageCommon::BT_POLY: { return num_commands; } break; } // error condition WARN_PRINT_ONCE("reading num_verts from incorrect batch type"); return 0; } }; struct BatchTex { enum TileMode : uint32_t { TILE_OFF, TILE_NORMAL, TILE_FORCE_REPEAT, }; RID RID_texture; RID RID_normal; TileMode tile_mode; BatchVector2 tex_pixel_size; uint32_t flags; }; // items in a list to be sorted prior to joining struct BSortItem { // have a function to keep as pod, rather than operator void assign(const BSortItem &o) { item = o.item; z_index = o.z_index; } RasterizerCanvas::Item *item; int z_index; }; // batch item may represent 1 or more items struct BItemJoined { uint32_t first_item_ref; uint32_t num_item_refs; Rect2 bounding_rect; // note the z_index may only be correct for the first of the joined item references // this has implications for light culling with z ranged lights. 
int16_t z_index; // these are defined in RasterizerStorageCommon::BatchFlags uint16_t flags; // we are always splitting items with lots of commands, // and items with unhandled primitives (default) bool is_single_item() const { return (num_item_refs == 1); } bool use_attrib_transform() const { return flags & RasterizerStorageCommon::USE_LARGE_FVF; } }; struct BItemRef { RasterizerCanvas::Item *item; Color final_modulate; }; struct BLightRegion { void reset() { light_bitfield = 0; shadow_bitfield = 0; } uint64_t light_bitfield; uint64_t shadow_bitfield; }; struct BatchData { BatchData() { reset_flush(); reset_joined_item(); gl_vertex_buffer = 0; gl_index_buffer = 0; max_quads = 0; vertex_buffer_size_units = 0; vertex_buffer_size_bytes = 0; index_buffer_size_units = 0; index_buffer_size_bytes = 0; use_colored_vertices = false; settings_use_batching = false; settings_max_join_item_commands = 0; settings_colored_vertex_format_threshold = 0.0f; scissor_threshold_area = 0.0f; joined_item_batch_flags = 0; diagnose_frame = false; next_diagnose_tick = 10000; diagnose_frame_number = 9999999999; // some high number join_across_z_indices = true; settings_item_reordering_lookahead = 0; settings_use_batching_original_choice = false; settings_flash_batching = false; settings_diagnose_frame = false; settings_scissor_lights = false; settings_scissor_threshold = -1.0f; settings_use_single_rect_fallback = false; settings_ninepatch_mode = 0; // default settings_light_max_join_items = 16; settings_uv_contract = false; settings_uv_contract_amount = 0.0f; buffer_mode_batch_upload_send_null = true; buffer_mode_batch_upload_flag_stream = false; stats_items_sorted = 0; stats_light_items_joined = 0; } // called for each joined item void reset_joined_item() { // noop but left in as a stub } // called after each flush void reset_flush() { batches.reset(); batch_textures.reset(); vertices.reset(); light_angles.reset(); vertex_colors.reset(); vertex_modulates.reset(); vertex_transforms.reset(); 
total_quads = 0; total_verts = 0; total_color_changes = 0; use_light_angles = false; use_modulate = false; use_large_verts = false; fvf = RasterizerStorageCommon::FVF_REGULAR; } unsigned int gl_vertex_buffer; unsigned int gl_index_buffer; uint32_t max_quads; uint32_t vertex_buffer_size_units; uint32_t vertex_buffer_size_bytes; uint32_t index_buffer_size_units; uint32_t index_buffer_size_bytes; // small vertex FVF type - pos and UV. // This will always be written to initially, but can be translated // to larger FVFs if necessary. RasterizerArray vertices; // extra data which can be stored during prefilling, for later translation to larger FVFs RasterizerArray light_angles; RasterizerArray vertex_colors; // these aren't usually used, but are for polys RasterizerArray vertex_modulates; RasterizerArray vertex_transforms; // instead of having a different buffer for each vertex FVF type // we have a special array big enough for the biggest FVF // which can have a changeable unit size, and reuse it. RasterizerUnitArray unit_vertices; RasterizerArray batches; RasterizerArray batches_temp; // used for translating to colored vertex batches RasterizerArray_non_pod batch_textures; // the only reason this is non-POD is because of RIDs // SHOULD THESE BE IN FILLSTATE? // flexible vertex format. // all verts have pos and UV. // some have color, some light angles etc. RasterizerStorageCommon::FVF fvf; bool use_colored_vertices; bool use_light_angles; bool use_modulate; bool use_large_verts; // if the shader is using MODULATE, we prevent baking color so the final_modulate can // be read in the shader. // if the shader is reading VERTEX, we prevent baking vertex positions with extra matrices etc // to prevent the read position being incorrect. 
// These flags are defined in RasterizerStorageCommon::BatchFlags uint32_t joined_item_batch_flags; RasterizerArray items_joined; RasterizerArray item_refs; // items are sorted prior to joining RasterizerArray sort_items; // counts int total_quads; int total_verts; // we keep a record of how many color changes caused new batches // if the colors are causing an excessive number of batches, we switch // to alternate batching method and add color to the vertex format. int total_color_changes; // measured in pixels, recalculated each frame float scissor_threshold_area; // diagnose this frame, every nTh frame when settings_diagnose_frame is on bool diagnose_frame; String frame_string; uint32_t next_diagnose_tick; uint64_t diagnose_frame_number; // whether to join items across z_indices - this can interfere with z ranged lights, // so has to be disabled in some circumstances bool join_across_z_indices; // global settings bool settings_use_batching; // the current use_batching (affected by flash) bool settings_use_batching_original_choice; // the choice entered in project settings bool settings_flash_batching; // for regression testing, flash between non-batched and batched renderer bool settings_diagnose_frame; // print out batches to help optimize / regression test int settings_max_join_item_commands; float settings_colored_vertex_format_threshold; bool settings_scissor_lights; float settings_scissor_threshold; // 0.0 to 1.0 int settings_item_reordering_lookahead; bool settings_use_single_rect_fallback; int settings_light_max_join_items; int settings_ninepatch_mode; // buffer orphaning modes bool buffer_mode_batch_upload_send_null; bool buffer_mode_batch_upload_flag_stream; // uv contraction bool settings_uv_contract; float settings_uv_contract_amount; // only done on diagnose frame void reset_stats() { stats_items_sorted = 0; stats_light_items_joined = 0; } // frame stats (just for monitoring and debugging) int stats_items_sorted; int stats_light_items_joined; } bdata; 
struct FillState { void reset_flush() { // don't reset members that need to be preserved after flushing // half way through a list of commands curr_batch = nullptr; batch_tex_id = -1; texpixel_size = Vector2(1, 1); contract_uvs = false; sequence_batch_type_flags = 0; } void reset_joined_item(bool p_is_single_item, bool p_use_attrib_transform) { reset_flush(); is_single_item = p_is_single_item; use_attrib_transform = p_use_attrib_transform; use_software_transform = !is_single_item && !use_attrib_transform; extra_matrix_sent = false; } // for batching multiple types, we don't allow mixing RECTs / LINEs etc. // using flags allows quicker rejection of sequences with different batch types uint32_t sequence_batch_type_flags; Batch *curr_batch; int batch_tex_id; bool is_single_item; bool use_attrib_transform; bool use_software_transform; bool contract_uvs; Vector2 texpixel_size; Color final_modulate; TransformMode transform_mode; TransformMode orig_transform_mode; // support for extra matrices bool extra_matrix_sent; // whether sent on this item (in which case software transform can't be used untl end of item) int transform_extra_command_number_p1; // plus one to allow fast checking against zero Transform2D transform_combined; // final * extra Transform2D skeleton_base_inverse_xform; // used in software skinning }; // used during try_join struct RenderItemState { RenderItemState() { reset(); } void reset() { current_clip = nullptr; shader_cache = nullptr; rebind_shader = true; prev_use_skeleton = false; last_blend_mode = -1; canvas_last_material = RID(); item_group_z = 0; final_modulate = Color(-1.0, -1.0, -1.0, -1.0); // just something unlikely joined_item_batch_type_flags_curr = 0; joined_item_batch_type_flags_prev = 0; joined_item = nullptr; } RasterizerCanvas::Item *current_clip; typename T_STORAGE::Shader *shader_cache; bool rebind_shader; bool prev_use_skeleton; bool prev_distance_field; int last_blend_mode; RID canvas_last_material; Color final_modulate; // used 
for joining items only BItemJoined *joined_item; bool join_batch_break; BLightRegion light_region; // we need some logic to prevent joining items that have vastly different batch types // these are defined in RasterizerStorageCommon::BatchTypeFlags uint32_t joined_item_batch_type_flags_curr; uint32_t joined_item_batch_type_flags_prev; // 'item group' is data over a single call to canvas_render_items int item_group_z; Color item_group_modulate; Transform2D item_group_base_transform; } _render_item_state; bool use_nvidia_rect_workaround; ////////////////////////////////////////////////////////////////////////////// // End of structs used by the batcher. Beginning of funcs. private: // curiously recurring template pattern - allows access to functions in the DERIVED class // this is kind of like using virtual functions but more efficient as they are resolved at compile time T_STORAGE *get_storage() { return static_cast(this)->storage; } const T_STORAGE *get_storage() const { return static_cast(this)->storage; } T *get_this() { return static_cast(this); } const T *get_this() const { return static_cast(this); } protected: // main functions called from the rasterizer canvas void batch_constructor(); void batch_initialize(); void batch_canvas_begin(); void batch_canvas_end(); void batch_canvas_render_items_begin(const Color &p_modulate, const Transform2D &p_base_transform); void batch_canvas_render_items_end(); void batch_canvas_render_items(RasterizerCanvas::Item *p_item_list, int p_z, const Color &p_modulate, const Transform2D &p_base_transform); // recording and sorting items from the initial pass void record_items(RasterizerCanvas::Item *p_item_list, int p_z); void join_sorted_items(); void sort_items(); bool _sort_items_match(const BSortItem &p_a, const BSortItem &p_b) const; bool sort_items_from(int p_start); // joining logic bool _disallow_item_join_if_batch_types_too_different(RenderItemState &r_ris, uint32_t btf_allowed); bool 
_detect_item_batch_break(RenderItemState &r_ris, RasterizerCanvas::Item *p_ci, bool &r_batch_break); // drives the loop filling batches and flushing void render_joined_item_commands(const BItemJoined &p_bij, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, bool p_lit, const RenderItemState &p_ris); private: // flush once full or end of joined item void flush_render_batches(RasterizerCanvas::Item *p_first_item, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, uint32_t p_sequence_batch_type_flags); // a single joined item can contain multiple itemrefs, and thus create lots of batches bool prefill_joined_item(FillState &r_fill_state, int &r_command_start, RasterizerCanvas::Item *p_item, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material); // prefilling different types of batch // default batch is an 'unhandled' legacy type batch that will be drawn with the legacy path, // all other batches are accelerated. 
void _prefill_default_batch(FillState &r_fill_state, int p_command_num, const RasterizerCanvas::Item &p_item); // accelerated batches bool _prefill_line(RasterizerCanvas::Item::CommandLine *p_line, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RasterizerCanvas::Item *p_item, bool multiply_final_modulate); template bool _prefill_ninepatch(RasterizerCanvas::Item::CommandNinePatch *p_np, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RasterizerCanvas::Item *p_item, bool multiply_final_modulate); template bool _prefill_polygon(RasterizerCanvas::Item::CommandPolygon *p_poly, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RasterizerCanvas::Item *p_item, bool multiply_final_modulate); template bool _prefill_rect(RasterizerCanvas::Item::CommandRect *rect, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RasterizerCanvas::Item::Command *const *commands, RasterizerCanvas::Item *p_item, bool multiply_final_modulate); template bool _prefill_multirect(RasterizerCanvas::Item::CommandMultiRect *mrect, FillState &r_fill_state, int &r_command_start, int command_num, bool multiply_final_modulate); // dealing with textures int _batch_find_or_create_tex(const RID &p_texture, const RID &p_normal, bool p_tile, int p_previous_match); protected: // legacy support for non batched mode void _legacy_canvas_item_render_commands(RasterizerCanvas::Item *p_item, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material); // light scissoring bool _light_scissor_begin(const Rect2 &p_item_rect, const Transform2D &p_light_xform, const Rect2 &p_light_rect) const; bool _light_find_intersection(const Rect2 &p_item_rect, const Transform2D &p_light_xform, const Rect2 &p_light_rect, Rect2 &r_cliprect) const; void _calculate_scissor_threshold_area(); private: // translating vertex formats prior to rendering void 
_translate_batches_to_vertex_colored_FVF(); template void _translate_batches_to_larger_FVF(uint32_t p_sequence_batch_type_flags); protected: // accessory funcs void _software_transform_vertex(BatchVector2 &r_v, const Transform2D &p_tr) const; void _software_transform_vertex(Vector2 &r_v, const Transform2D &p_tr) const; TransformMode _find_transform_mode(const Transform2D &p_tr) const { // decided whether to do translate only for software transform if ((p_tr.columns[0].x == 1.0f) && (p_tr.columns[0].y == 0.0f) && (p_tr.columns[1].x == 0.0f) && (p_tr.columns[1].y == 1.0f)) { return TM_TRANSLATE; } return TM_ALL; } typename T_STORAGE::Texture *_get_canvas_texture(const RID &p_texture) const { if (p_texture.is_valid()) { typename T_STORAGE::Texture *texture = get_storage()->texture_owner.getornull(p_texture); if (texture) { // could be a proxy texture (e.g. animated) if (texture->proxy) { // take care to prevent infinite loop int count = 0; while (texture->proxy) { texture = texture->proxy; count++; ERR_FAIL_COND_V_MSG(count == 16, nullptr, "Texture proxy infinite loop detected."); } } return texture->get_ptr(); } } return nullptr; } public: Batch *_batch_request_new(bool p_blank = true) { Batch *batch = bdata.batches.request(); if (!batch) { // grow the batches bdata.batches.grow(); // and the temporary batches (used for color verts) bdata.batches_temp.reset(); bdata.batches_temp.grow(); // this should always succeed after growing batch = bdata.batches.request(); RAST_DEBUG_ASSERT(batch); } if (p_blank) { memset(batch, 0, sizeof(Batch)); } else { batch->item = nullptr; } return batch; } BatchVertex *_batch_vertex_request_new() { return bdata.vertices.request(); } protected: // no need to compile these in in release, they are unneeded outside the editor and only add to executable size #if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) #include "batch_diagnose.inc" #endif }; PREAMBLE(void)::batch_canvas_begin() { // diagnose_frame? 
bdata.frame_string = ""; // just in case, always set this as we don't want a string leak in release... #if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) if (bdata.settings_diagnose_frame) { bdata.diagnose_frame = false; uint32_t tick = OS::get_singleton()->get_ticks_msec(); uint64_t frame = Engine::get_singleton()->get_frames_drawn(); if (tick >= bdata.next_diagnose_tick) { bdata.next_diagnose_tick = tick + 10000; // the plus one is prevent starting diagnosis half way through frame bdata.diagnose_frame_number = frame + 1; } if (frame == bdata.diagnose_frame_number) { bdata.diagnose_frame = true; bdata.reset_stats(); } if (bdata.diagnose_frame) { bdata.frame_string = "canvas_begin FRAME " + itos(frame) + "\n"; } } #endif } PREAMBLE(void)::batch_canvas_end() { #if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) if (bdata.diagnose_frame) { bdata.frame_string += "canvas_end\n"; if (bdata.stats_items_sorted) { bdata.frame_string += "\titems reordered: " + itos(bdata.stats_items_sorted) + "\n"; } if (bdata.stats_light_items_joined) { bdata.frame_string += "\tlight items joined: " + itos(bdata.stats_light_items_joined) + "\n"; } print_line(bdata.frame_string); } #endif } PREAMBLE(void)::batch_canvas_render_items_begin(const Color &p_modulate, const Transform2D &p_base_transform) { // if we are debugging, flash each frame between batching renderer and old version to compare for regressions if (bdata.settings_flash_batching) { if ((Engine::get_singleton()->get_frames_drawn() % 2) == 0) { bdata.settings_use_batching = true; } else { bdata.settings_use_batching = false; } } if (!bdata.settings_use_batching) { return; } // this only needs to be done when screen size changes, but this should be // infrequent enough _calculate_scissor_threshold_area(); // set up render item state for all the z_indexes (this is common to all z_indexes) _render_item_state.reset(); _render_item_state.item_group_modulate = p_modulate; _render_item_state.item_group_base_transform = 
p_base_transform; _render_item_state.light_region.reset(); // batch break must be preserved over the different z indices, // to prevent joining to an item on a previous index if not allowed _render_item_state.join_batch_break = false; // whether to join across z indices depends on whether there are z ranged lights. // joined z_index items can be wrongly classified with z ranged lights. bdata.join_across_z_indices = true; } PREAMBLE(void)::batch_canvas_render_items_end() { if (!bdata.settings_use_batching) { return; } join_sorted_items(); #if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) if (bdata.diagnose_frame) { bdata.frame_string += "items\n"; } #endif // batching render is deferred until after going through all the z_indices, joining all the items get_this()->canvas_render_items_implementation(nullptr, 0, _render_item_state.item_group_modulate, _render_item_state.item_group_base_transform); bdata.items_joined.reset(); bdata.item_refs.reset(); bdata.sort_items.reset(); } PREAMBLE(void)::batch_canvas_render_items(RasterizerCanvas::Item *p_item_list, int p_z, const Color &p_modulate, const Transform2D &p_base_transform) { // stage 1 : join similar items, so that their state changes are not repeated, // and commands from joined items can be batched together if (bdata.settings_use_batching) { record_items(p_item_list, p_z); return; } // only legacy renders at this stage, batched renderer doesn't render until canvas_render_items_end() get_this()->canvas_render_items_implementation(p_item_list, p_z, p_modulate, p_base_transform); } // Default batches will not occur in software transform only items // EXCEPT IN THE CASE OF SINGLE RECTS (and this may well not occur, check the logic in prefill_join_item TYPE_RECT) // but can occur where transform commands have been sent during hardware batch PREAMBLE(void)::_prefill_default_batch(FillState &r_fill_state, int p_command_num, const RasterizerCanvas::Item &p_item) { if (r_fill_state.curr_batch->type == 
RasterizerStorageCommon::BT_DEFAULT) { // don't need to flush an extra transform command? if (!r_fill_state.transform_extra_command_number_p1) { // another default command, just add to the existing batch r_fill_state.curr_batch->num_commands++; // Note this is getting hit, needs investigation as to whether this is a bug or a false flag // DEV_CHECK_ONCE(r_fill_state.curr_batch->num_commands <= p_command_num); } else { #if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) if (r_fill_state.transform_extra_command_number_p1 != p_command_num) { WARN_PRINT_ONCE("_prefill_default_batch : transform_extra_command_number_p1 != p_command_num"); } #endif // if the first member of the batch is a transform we have to be careful if (!r_fill_state.curr_batch->num_commands) { // there can be leading useless extra transforms (sometimes happens with debug collision polys) // we need to rejig the first_command for the first useful transform r_fill_state.curr_batch->first_command += r_fill_state.transform_extra_command_number_p1 - 1; } // we do have a pending extra transform command to flush // either the extra transform is in the prior command, or not, in which case we need 2 batches r_fill_state.curr_batch->num_commands += 2; r_fill_state.transform_extra_command_number_p1 = 0; // mark as sent r_fill_state.extra_matrix_sent = true; // the original mode should always be hardware transform .. // test this assumption //CRASH_COND(r_fill_state.orig_transform_mode != TM_NONE); r_fill_state.transform_mode = r_fill_state.orig_transform_mode; // do we need to restore anything else? 
} } else { // end of previous different type batch, so start new default batch // first consider whether there is a dirty extra matrix to send if (r_fill_state.transform_extra_command_number_p1) { // get which command the extra is in, and blank all the records as it no longer is stored CPU side int extra_command = r_fill_state.transform_extra_command_number_p1 - 1; // plus 1 based r_fill_state.transform_extra_command_number_p1 = 0; r_fill_state.extra_matrix_sent = true; // send the extra to the GPU in a batch r_fill_state.curr_batch = _batch_request_new(); r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_DEFAULT; r_fill_state.curr_batch->first_command = extra_command; r_fill_state.curr_batch->num_commands = 1; r_fill_state.curr_batch->item = &p_item; // revert to the original transform mode // e.g. go back to NONE if we were in hardware transform mode r_fill_state.transform_mode = r_fill_state.orig_transform_mode; // reset the original transform if we are going back to software mode, // because the extra is now done on the GPU... // (any subsequent extras are sent directly to the GPU, no deferring) if (r_fill_state.orig_transform_mode != TM_NONE) { r_fill_state.transform_combined = p_item.final_transform; } // can possibly combine batch with the next one in some cases // this is more efficient than having an extra batch especially for the extra if ((extra_command + 1) == p_command_num) { r_fill_state.curr_batch->num_commands = 2; return; } } // start default batch r_fill_state.curr_batch = _batch_request_new(); r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_DEFAULT; r_fill_state.curr_batch->first_command = p_command_num; r_fill_state.curr_batch->num_commands = 1; r_fill_state.curr_batch->item = &p_item; } } PREAMBLE(int)::_batch_find_or_create_tex(const RID &p_texture, const RID &p_normal, bool p_tile, int p_previous_match) { // optimization .. 
in 99% cases the last matched value will be the same, so no need to traverse the list if (p_previous_match > 0) // if it is zero, it will get hit first in the linear search anyway { const BatchTex &batch_texture = bdata.batch_textures[p_previous_match]; // note for future reference, if RID implementation changes, this could become more expensive if ((batch_texture.RID_texture == p_texture) && (batch_texture.RID_normal == p_normal)) { // tiling mode must also match bool tiles = batch_texture.tile_mode != BatchTex::TILE_OFF; if (tiles == p_tile) { // match! return p_previous_match; } } } // not the previous match .. we will do a linear search ... slower, but should happen // not very often except with non-batchable runs, which are going to be slow anyway // n.b. could possibly be replaced later by a fast hash table for (int n = 0; n < bdata.batch_textures.size(); n++) { const BatchTex &batch_texture = bdata.batch_textures[n]; if ((batch_texture.RID_texture == p_texture) && (batch_texture.RID_normal == p_normal)) { // tiling mode must also match bool tiles = batch_texture.tile_mode != BatchTex::TILE_OFF; if (tiles == p_tile) { // match! return n; } } } // pushing back from local variable .. not ideal but has to use a Vector because non pod // due to RIDs BatchTex new_batch_tex; new_batch_tex.RID_texture = p_texture; new_batch_tex.RID_normal = p_normal; // get the texture typename T_STORAGE::Texture *texture = _get_canvas_texture(p_texture); if (texture) { // special case, there can be textures with no width or height int w = texture->width; int h = texture->height; if (!w || !h) { w = 1; h = 1; } new_batch_tex.tex_pixel_size.x = 1.0 / w; new_batch_tex.tex_pixel_size.y = 1.0 / h; new_batch_tex.flags = texture->flags; } else { // maybe doesn't need doing... 
new_batch_tex.tex_pixel_size.x = 1.0f; new_batch_tex.tex_pixel_size.y = 1.0f; new_batch_tex.flags = 0; } if (p_tile) { if (texture) { // default new_batch_tex.tile_mode = BatchTex::TILE_NORMAL; // no hardware support for non power of 2 tiling if (!get_storage()->config.support_npot_repeat_mipmap) { if (next_power_of_2(texture->alloc_width) != (unsigned int)texture->alloc_width && next_power_of_2(texture->alloc_height) != (unsigned int)texture->alloc_height) { new_batch_tex.tile_mode = BatchTex::TILE_FORCE_REPEAT; } } } else { // this should not happen? new_batch_tex.tile_mode = BatchTex::TILE_OFF; } } else { new_batch_tex.tile_mode = BatchTex::TILE_OFF; } // push back bdata.batch_textures.push_back(new_batch_tex); return bdata.batch_textures.size() - 1; } PREAMBLE(void)::batch_constructor() { bdata.settings_use_batching = false; #ifdef GLES_OVER_GL use_nvidia_rect_workaround = GLOBAL_GET("rendering/2d/options/use_nvidia_rect_flicker_workaround"); #else // Not needed (a priori) on GLES devices use_nvidia_rect_workaround = false; #endif } PREAMBLE(void)::batch_initialize() { bdata.settings_use_batching = GLOBAL_GET("rendering/batching/options/use_batching"); bdata.settings_max_join_item_commands = GLOBAL_GET("rendering/batching/parameters/max_join_item_commands"); bdata.settings_colored_vertex_format_threshold = GLOBAL_GET("rendering/batching/parameters/colored_vertex_format_threshold"); bdata.settings_item_reordering_lookahead = GLOBAL_GET("rendering/batching/parameters/item_reordering_lookahead"); bdata.settings_light_max_join_items = GLOBAL_GET("rendering/batching/lights/max_join_items"); bdata.settings_use_single_rect_fallback = GLOBAL_GET("rendering/batching/options/single_rect_fallback"); bdata.settings_ninepatch_mode = GLOBAL_GET("rendering/2d/options/ninepatch_mode"); // allow user to override the api usage techniques using project settings int send_null_mode = GLOBAL_GET("rendering/2d/opengl/batching_send_null"); switch (send_null_mode) { default: { 
bdata.buffer_mode_batch_upload_send_null = true; } break; case 1: { bdata.buffer_mode_batch_upload_send_null = false; } break; case 2: { bdata.buffer_mode_batch_upload_send_null = true; } break; } int stream_mode = GLOBAL_GET("rendering/2d/opengl/batching_stream"); switch (stream_mode) { default: { bdata.buffer_mode_batch_upload_flag_stream = false; } break; case 1: { bdata.buffer_mode_batch_upload_flag_stream = false; } break; case 2: { bdata.buffer_mode_batch_upload_flag_stream = true; } break; } // alternatively only enable uv contract if pixel snap in use, // but with this enable bool, it should not be necessary bdata.settings_uv_contract = GLOBAL_GET("rendering/batching/precision/uv_contract"); bdata.settings_uv_contract_amount = (float)GLOBAL_GET("rendering/batching/precision/uv_contract_amount") / 1000000.0f; // we can use the threshold to determine whether to turn scissoring off or on bdata.settings_scissor_threshold = GLOBAL_GET("rendering/batching/lights/scissor_area_threshold"); if (bdata.settings_scissor_threshold > 0.999f) { bdata.settings_scissor_lights = false; } else { bdata.settings_scissor_lights = true; // apply power of 4 relationship for the area, as most of the important changes // will be happening at low values of scissor threshold bdata.settings_scissor_threshold *= bdata.settings_scissor_threshold; bdata.settings_scissor_threshold *= bdata.settings_scissor_threshold; } // The sweet spot on my desktop for cache is actually smaller than the max, and this // is the default. This saves memory too so we will use it for now, needs testing to see whether this varies according // to device / platform. int batch_buffer_num_verts_requested = GLOBAL_GET("rendering/batching/parameters/batch_buffer_size"); // override the use_batching setting in the editor // (note that if the editor can't start, you can't change the use_batching project setting!) 
if (Engine::get_singleton()->is_editor_hint()) { bool use_in_editor = GLOBAL_GET("rendering/batching/options/use_batching_in_editor"); bdata.settings_use_batching = use_in_editor; // fix some settings in the editor, as the performance not worth the risk bdata.settings_use_single_rect_fallback = false; } // if we are using batching, we will purposefully disable the nvidia workaround. // This is because the only reason to use the single rect fallback is the approx 2x speed // of the uniform drawing technique. If we used nvidia workaround, speed would be // approx equal to the batcher drawing technique (indexed primitive + VB). if (bdata.settings_use_batching) { use_nvidia_rect_workaround = false; } // For debugging, if flash is set in project settings, it will flash on alternate frames // between the non-batched renderer and the batched renderer, // in order to find regressions. // This should not be used except during development. // make a note of the original choice in case we are flashing on and off the batching bdata.settings_use_batching_original_choice = bdata.settings_use_batching; bdata.settings_flash_batching = GLOBAL_GET("rendering/batching/debug/flash_batching"); if (!bdata.settings_use_batching) { // no flash when batching turned off bdata.settings_flash_batching = false; } // frame diagnosis. print out the batches every nth frame bdata.settings_diagnose_frame = false; if (!Engine::get_singleton()->is_editor_hint() && bdata.settings_use_batching) { bdata.settings_diagnose_frame = GLOBAL_GET("rendering/batching/debug/diagnose_frame"); } // the maximum num quads in a batch is limited by GLES2. We can have only 16 bit indices, // which means we can address a vertex buffer of max size 65535. 4 vertices are needed per quad. // Note this determines the memory use by the vertex buffer vector. 
max quads (65536/4)-1 // but can be reduced to save memory if really required (will result in more batches though) const int max_possible_quads = (65536 / 4) - 1; // We must have enough quads to fit in a MultiRect const int min_possible_quads = MAX(8, MultiRect::MAX_RECTS); // some reasonable small value // value from project settings int max_quads = batch_buffer_num_verts_requested / 4; bool use_multirect = GLOBAL_GET("rendering/batching/options/use_multirect"); RenderingServerCanvasHelper::_multirect_enabled = (bdata.settings_use_batching && use_multirect); // sanity checks max_quads = CLAMP(max_quads, min_possible_quads, max_possible_quads); bdata.settings_max_join_item_commands = CLAMP(bdata.settings_max_join_item_commands, 0, 65535); bdata.settings_colored_vertex_format_threshold = CLAMP(bdata.settings_colored_vertex_format_threshold, 0.0f, 1.0f); bdata.settings_scissor_threshold = CLAMP(bdata.settings_scissor_threshold, 0.0f, 1.0f); bdata.settings_light_max_join_items = CLAMP(bdata.settings_light_max_join_items, 0, 65535); bdata.settings_item_reordering_lookahead = CLAMP(bdata.settings_item_reordering_lookahead, 0, 65535); // special case, for colored vertex format threshold. 
// as the comparison is >=, we want to be able to totally turn on or off // conversion to colored vertex format at the extremes, so we will force // 1.0 to be just above 1.0 if (bdata.settings_colored_vertex_format_threshold > 0.995f) { bdata.settings_colored_vertex_format_threshold = 1.01f; } // save memory when batching off if (!bdata.settings_use_batching) { max_quads = 0; } uint32_t sizeof_batch_vert = sizeof(BatchVertex); bdata.max_quads = max_quads; // 4 verts per quad bdata.vertex_buffer_size_units = max_quads * 4; // the index buffer can be longer than 65535, only the indices need to be within this range bdata.index_buffer_size_units = max_quads * 6; const int max_verts = bdata.vertex_buffer_size_units; // this comes out at approx 64K for non-colored vertex buffer, and 128K for colored vertex buffer bdata.vertex_buffer_size_bytes = max_verts * sizeof_batch_vert; bdata.index_buffer_size_bytes = bdata.index_buffer_size_units * 2; // 16 bit inds // For debug purposes, output a string with the batching options. 
if (bdata.settings_use_batching) { String batching_options_string = "OpenGL ES 2D Batching: ON\n"; batching_options_string += "Batching Options:\n"; batching_options_string += "\tmax_join_item_commands " + itos(bdata.settings_max_join_item_commands) + "\n"; batching_options_string += "\tcolored_vertex_format_threshold " + String(Variant(bdata.settings_colored_vertex_format_threshold)) + "\n"; batching_options_string += "\tbatch_buffer_effective_size " + itos(bdata.vertex_buffer_size_units) + "\n"; batching_options_string += "\tlight_scissor_area_threshold " + String(Variant(bdata.settings_scissor_threshold)) + "\n"; batching_options_string += "\titem_reordering_lookahead " + itos(bdata.settings_item_reordering_lookahead) + "\n"; batching_options_string += "\tlight_max_join_items " + itos(bdata.settings_light_max_join_items) + "\n"; batching_options_string += "\tsingle_rect_fallback " + String(Variant(bdata.settings_use_single_rect_fallback)) + "\n"; batching_options_string += "\tdebug_flash " + String(Variant(bdata.settings_flash_batching)) + "\n"; batching_options_string += "\tdiagnose_frame " + String(Variant(bdata.settings_diagnose_frame)); print_verbose(batching_options_string); } // create equal number of normal and (max) unit sized verts (as the normal may need to be translated to a larger FVF) bdata.vertices.create(max_verts); // 512k bdata.unit_vertices.create(max_verts, sizeof(BatchVertexLarge)); // extra data per vert needed for larger FVFs bdata.light_angles.create(max_verts); bdata.vertex_colors.create(max_verts); bdata.vertex_modulates.create(max_verts); bdata.vertex_transforms.create(max_verts); // num batches will be auto increased dynamically if required bdata.batches.create(1024); bdata.batches_temp.create(bdata.batches.max_size()); // batch textures can also be increased dynamically bdata.batch_textures.create(32); } PREAMBLE(bool)::_light_scissor_begin(const Rect2 &p_item_rect, const Transform2D &p_light_xform, const Rect2 &p_light_rect) const { 
float area_item = p_item_rect.size.x * p_item_rect.size.y; // double check these are always positive // quick reject .. the area of pixels saved can never be more than the area of the item if (area_item < bdata.scissor_threshold_area) { return false; } Rect2 cliprect; if (!_light_find_intersection(p_item_rect, p_light_xform, p_light_rect, cliprect)) { // should not really occur .. but just in case cliprect = Rect2(0, 0, 0, 0); } else { // some conditions not to scissor // determine the area (fill rate) that will be saved float area_cliprect = cliprect.size.x * cliprect.size.y; float area_saved = area_item - area_cliprect; // if area saved is too small, don't scissor if (area_saved < bdata.scissor_threshold_area) { return false; } } int rh = get_storage()->frame.current_rt->height; // using the exact size was leading to off by one errors, // possibly due to pixel snap. For this reason we will boost // the scissor area by 1 pixel, this will take care of any rounding // issues, and shouldn't significantly negatively impact performance. 
int y = rh - (cliprect.position.y + cliprect.size.y); y += 1; // off by 1 boost before flipping if (get_storage()->frame.current_rt->flags[RasterizerStorage::RENDER_TARGET_VFLIP]) { y = cliprect.position.y; } get_this()->gl_enable_scissor(cliprect.position.x - 1, y, cliprect.size.width + 2, cliprect.size.height + 2); return true; } PREAMBLE(bool)::_light_find_intersection(const Rect2 &p_item_rect, const Transform2D &p_light_xform, const Rect2 &p_light_rect, Rect2 &r_cliprect) const { // transform light to world space (note this is done in the earlier intersection test, so could // be made more efficient) Vector2 pts[4] = { p_light_xform.xform(p_light_rect.position), p_light_xform.xform(Vector2(p_light_rect.position.x + p_light_rect.size.x, p_light_rect.position.y)), p_light_xform.xform(Vector2(p_light_rect.position.x, p_light_rect.position.y + p_light_rect.size.y)), p_light_xform.xform(Vector2(p_light_rect.position.x + p_light_rect.size.x, p_light_rect.position.y + p_light_rect.size.y)), }; // calculate the light bound rect in world space Rect2 lrect(pts[0].x, pts[0].y, 0, 0); for (int n = 1; n < 4; n++) { lrect.expand_to(pts[n]); } // intersection between the 2 rects // they should probably always intersect, because of earlier check, but just in case... 
if (!p_item_rect.intersects(lrect)) { return false; } // note this does almost the same as Rect2.clip but slightly more efficient for our use case r_cliprect.position.x = MAX(p_item_rect.position.x, lrect.position.x); r_cliprect.position.y = MAX(p_item_rect.position.y, lrect.position.y); Point2 item_rect_end = p_item_rect.position + p_item_rect.size; Point2 lrect_end = lrect.position + lrect.size; r_cliprect.size.x = MIN(item_rect_end.x, lrect_end.x) - r_cliprect.position.x; r_cliprect.size.y = MIN(item_rect_end.y, lrect_end.y) - r_cliprect.position.y; return true; } PREAMBLE(void)::_calculate_scissor_threshold_area() { if (!bdata.settings_scissor_lights) { return; } // scissor area threshold is 0.0 to 1.0 in the settings for ease of use. // we need to translate to an absolute area to determine quickly whether // to scissor. if (bdata.settings_scissor_threshold < 0.0001f) { bdata.scissor_threshold_area = -1.0f; // will always pass } else { // in pixels int w = get_storage()->frame.current_rt->width; int h = get_storage()->frame.current_rt->height; int screen_area = w * h; bdata.scissor_threshold_area = bdata.settings_scissor_threshold * screen_area; } } PREAMBLE(bool)::_prefill_line(RasterizerCanvas::Item::CommandLine *p_line, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RasterizerCanvas::Item *p_item, bool multiply_final_modulate) { bool change_batch = false; // we have separate batch types for non and anti aliased lines. // You can't batch the different types together. 
RasterizerStorageCommon::BatchType line_batch_type = RasterizerStorageCommon::BT_LINE; uint32_t line_batch_flags = RasterizerStorageCommon::BTF_LINE; #ifdef GLES_OVER_GL if (p_line->antialiased) { line_batch_type = RasterizerStorageCommon::BT_LINE_AA; line_batch_flags = RasterizerStorageCommon::BTF_LINE_AA; } #endif // conditions for creating a new batch if (r_fill_state.curr_batch->type != line_batch_type) { if (r_fill_state.sequence_batch_type_flags & (~line_batch_flags)) { // don't allow joining to a different sequence type r_command_start = command_num; return true; } r_fill_state.sequence_batch_type_flags |= line_batch_flags; change_batch = true; } // get the baked line color Color col = p_line->color; if (multiply_final_modulate) { col *= r_fill_state.final_modulate; } BatchColor bcol; bcol.set(col); // if the color has changed we need a new batch // (only single color line batches supported so far) if (!change_batch && r_fill_state.curr_batch->color != bcol) { change_batch = true; } // not sure if needed r_fill_state.batch_tex_id = -1; // try to create vertices BEFORE creating a batch, // because if the vertex buffer is full, we need to finish this // function, draw what we have so far, and then start a new set of batches // request multiple vertices at a time, this is more efficient BatchVertex *bvs = bdata.vertices.request(2); if (!bvs) { // run out of space in the vertex buffer .. finish this function and draw what we have so far // return where we got to r_command_start = command_num; return true; } if (change_batch) { // open new batch (this should never fail, it dynamically grows) r_fill_state.curr_batch = _batch_request_new(false); r_fill_state.curr_batch->type = line_batch_type; r_fill_state.curr_batch->color = bcol; // cast is to stop sanitizer benign warning .. 
watch though in case destination type changes r_fill_state.curr_batch->batch_texture_id = (uint16_t)-1; r_fill_state.curr_batch->first_command = command_num; r_fill_state.curr_batch->num_commands = 1; //r_fill_state.curr_batch->first_quad = bdata.total_quads; r_fill_state.curr_batch->first_vert = bdata.total_verts; } else { // we could alternatively do the count when closing a batch .. perhaps more efficient r_fill_state.curr_batch->num_commands++; } // fill the geometry Vector2 from = p_line->from; Vector2 to = p_line->to; const bool use_large_verts = bdata.use_large_verts; if ((r_fill_state.transform_mode != TM_NONE) && (!use_large_verts)) { _software_transform_vertex(from, r_fill_state.transform_combined); _software_transform_vertex(to, r_fill_state.transform_combined); } bvs[0].pos.set(from); bvs[0].uv.set(0, 0); // may not be necessary bvs[1].pos.set(to); bvs[1].uv.set(0, 0); bdata.total_verts += 2; return false; } //unsigned int _ninepatch_apply_tiling_modes(RasterizerCanvas::Item::CommandNinePatch *p_np, Rect2 &r_source) { // unsigned int rect_flags = 0; // switch (p_np->axis_x) { // default: // break; // case RenderingServer::NINE_PATCH_TILE: { // r_source.size.x = p_np->rect.size.x; // rect_flags = RasterizerCanvas::CANVAS_RECT_TILE; // } break; // case RenderingServer::NINE_PATCH_TILE_FIT: { // // prevent divide by zero (may never happen) // if (r_source.size.x) { // int units = p_np->rect.size.x / r_source.size.x; // if (!units) // units++; // r_source.size.x = r_source.size.x * units; // rect_flags = RasterizerCanvas::CANVAS_RECT_TILE; // } // } break; // } // switch (p_np->axis_y) { // default: // break; // case RenderingServer::NINE_PATCH_TILE: { // r_source.size.y = p_np->rect.size.y; // rect_flags = RasterizerCanvas::CANVAS_RECT_TILE; // } break; // case RenderingServer::NINE_PATCH_TILE_FIT: { // // prevent divide by zero (may never happen) // if (r_source.size.y) { // int units = p_np->rect.size.y / r_source.size.y; // if (!units) // units++; // 
r_source.size.y = r_source.size.y * units; // rect_flags = RasterizerCanvas::CANVAS_RECT_TILE; // } // } break; // } // return rect_flags; //} T_PREAMBLE template bool C_PREAMBLE::_prefill_ninepatch(RasterizerCanvas::Item::CommandNinePatch *p_np, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RasterizerCanvas::Item *p_item, bool multiply_final_modulate) { typename T_STORAGE::Texture *tex = _get_canvas_texture(p_np->texture); if (!tex) { // FIXME: Handle textureless ninepatch gracefully WARN_PRINT("NinePatch without texture not supported yet, skipping."); return false; } if (tex->width == 0 || tex->height == 0) { WARN_PRINT("Cannot set empty texture to NinePatch."); return false; } // cope with ninepatch of zero area. These cannot be created by the user interface or gdscript, but can // be created programmatically from c++, e.g. by the Pandemonium UI for sliders. We will just not draw these. if ((p_np->rect.size.x * p_np->rect.size.y) <= 0.0f) { return false; } // conditions for creating a new batch if (r_fill_state.curr_batch->type != RasterizerStorageCommon::BT_RECT) { // don't allow joining to a different sequence type if (r_fill_state.sequence_batch_type_flags & (~RasterizerStorageCommon::BTF_RECT)) { // don't allow joining to a different sequence type r_command_start = command_num; return true; } } // first check there are enough verts for this to complete successfully if (bdata.vertices.size() + (4 * 9) > bdata.vertices.max_size()) { // return where we got to r_command_start = command_num; return true; } // create a temporary rect so we can reuse the rect routine RasterizerCanvas::Item::CommandRect trect; trect.texture = p_np->texture; trect.normal_map = p_np->normal_map; trect.modulate = p_np->color; trect.flags = RasterizerCanvas::CANVAS_RECT_REGION; //Size2 texpixel_size(1.0f / tex->width, 1.0f / tex->height); Rect2 source = p_np->source; if (source.size.x == 0 && source.size.y == 0) { source.size.x = tex->width; 
source.size.y = tex->height; } float screen_scale = 1.0f; // optional crazy ninepatch scaling mode if ((bdata.settings_ninepatch_mode == 1) && (source.size.x != 0) && (source.size.y != 0)) { screen_scale = MIN(p_np->rect.size.x / source.size.x, p_np->rect.size.y / source.size.y); screen_scale = MIN(1.0, screen_scale); } // deal with nine patch texture wrapping modes // this is switched off because it may not be possible with batching // trect.flags |= _ninepatch_apply_tiling_modes(p_np, source); // translate to rects Rect2 &rt = trect.rect; Rect2 &src = trect.source; float tex_margin_left = p_np->margin[MARGIN_LEFT]; float tex_margin_right = p_np->margin[MARGIN_RIGHT]; float tex_margin_top = p_np->margin[MARGIN_TOP]; float tex_margin_bottom = p_np->margin[MARGIN_BOTTOM]; float x[4]; x[0] = p_np->rect.position.x; x[1] = x[0] + (p_np->margin[MARGIN_LEFT] * screen_scale); x[3] = x[0] + (p_np->rect.size.x); x[2] = x[3] - (p_np->margin[MARGIN_RIGHT] * screen_scale); float y[4]; y[0] = p_np->rect.position.y; y[1] = y[0] + (p_np->margin[MARGIN_TOP] * screen_scale); y[3] = y[0] + (p_np->rect.size.y); y[2] = y[3] - (p_np->margin[MARGIN_BOTTOM] * screen_scale); float u[4]; u[0] = source.position.x; u[1] = u[0] + tex_margin_left; u[3] = u[0] + source.size.x; u[2] = u[3] - tex_margin_right; float v[4]; v[0] = source.position.y; v[1] = v[0] + tex_margin_top; v[3] = v[0] + source.size.y; v[2] = v[3] - tex_margin_bottom; // Some protection for the use of ninepatches with rect size smaller than margin size. // Note these cannot be produced by the UI, only programmatically, and the results // are somewhat undefined, because the margins overlap. // Ninepatch get_minimum_size() forces minimum size to be the sum of the margins. // So this should occur very rarely if ever. Consider commenting these 4 lines out for higher speed // in ninepatches. 
x[1] = MIN(x[1], x[3]); x[2] = MIN(x[2], x[3]); y[1] = MIN(y[1], y[3]); y[2] = MIN(y[2], y[3]); // temporarily override to prevent single rect fallback bool single_rect_fallback = bdata.settings_use_single_rect_fallback; bdata.settings_use_single_rect_fallback = false; // each line of the ninepatch for (int line = 0; line < 3; line++) { rt.position = Vector2(x[0], y[line]); rt.size = Vector2(x[1] - x[0], y[line + 1] - y[line]); src.position = Vector2(u[0], v[line]); src.size = Vector2(u[1] - u[0], v[line + 1] - v[line]); _prefill_rect(&trect, r_fill_state, r_command_start, command_num, command_count, nullptr, p_item, multiply_final_modulate); if ((line == 1) && (!p_np->draw_center)) { ; } else { rt.position.x = x[1]; rt.size.x = x[2] - x[1]; src.position.x = u[1]; src.size.x = u[2] - u[1]; _prefill_rect(&trect, r_fill_state, r_command_start, command_num, command_count, nullptr, p_item, multiply_final_modulate); } rt.position.x = x[2]; rt.size.x = x[3] - x[2]; src.position.x = u[2]; src.size.x = u[3] - u[2]; _prefill_rect(&trect, r_fill_state, r_command_start, command_num, command_count, nullptr, p_item, multiply_final_modulate); } // restore single rect fallback bdata.settings_use_single_rect_fallback = single_rect_fallback; return false; } T_PREAMBLE template bool C_PREAMBLE::_prefill_polygon(RasterizerCanvas::Item::CommandPolygon *p_poly, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RasterizerCanvas::Item *p_item, bool multiply_final_modulate) { bool change_batch = false; // conditions for creating a new batch if (r_fill_state.curr_batch->type != RasterizerStorageCommon::BT_POLY) { // don't allow joining to a different sequence type if (r_fill_state.sequence_batch_type_flags & (~RasterizerStorageCommon::BTF_POLY)) { // don't allow joining to a different sequence type r_command_start = command_num; return true; } r_fill_state.sequence_batch_type_flags |= RasterizerStorageCommon::BTF_POLY; change_batch = true; } int num_inds = 
p_poly->indices.size(); // nothing to draw? if (!num_inds || !p_poly->points.size()) { return false; } // we aren't using indices, so will transform verts more than once .. less efficient. // could be done with a temporary vertex buffer BatchVertex *bvs = bdata.vertices.request(num_inds); if (!bvs) { // run out of space in the vertex buffer // check for special case where the batching buffer is simply not big enough to fit this primitive. if (!bdata.vertices.size()) { // can't draw, ignore the primitive, otherwise we would enter an infinite loop WARN_PRINT_ONCE("poly has too many indices to draw, increase batch buffer size"); return false; } // .. finish this function and draw what we have so far // return where we got to r_command_start = command_num; return true; } BatchColor *vertex_colors = bdata.vertex_colors.request(num_inds); RAST_DEBUG_ASSERT(vertex_colors); // are we using large FVF? //////////////////////////////////// const bool use_large_verts = bdata.use_large_verts; const bool use_modulate = bdata.use_modulate; BatchColor *vertex_modulates = nullptr; if (use_modulate) { vertex_modulates = bdata.vertex_modulates.request(num_inds); RAST_DEBUG_ASSERT(vertex_modulates); // precalc the vertex modulate (will be shared by all verts) // we store the modulate as an attribute in the fvf rather than a uniform vertex_modulates[0].set(r_fill_state.final_modulate); } BatchTransform *pBT = nullptr; if (use_large_verts) { pBT = bdata.vertex_transforms.request(num_inds); RAST_DEBUG_ASSERT(pBT); // precalc the batch transform (will be shared by all verts) // we store the transform as an attribute in the fvf rather than a uniform const Transform2D &tr = r_fill_state.transform_combined; pBT[0].translate.set(tr.columns[2]); pBT[0].basis[0].set(tr.columns[0][0], tr.columns[0][1]); pBT[0].basis[1].set(tr.columns[1][0], tr.columns[1][1]); } //////////////////////////////////// // the modulate is always baked Color modulate; if (multiply_final_modulate) { modulate = 
r_fill_state.final_modulate; } else { modulate = Color(1, 1, 1, 1); } int old_batch_tex_id = r_fill_state.batch_tex_id; r_fill_state.batch_tex_id = _batch_find_or_create_tex(p_poly->texture, p_poly->normal_map, false, old_batch_tex_id); // conditions for creating a new batch if (old_batch_tex_id != r_fill_state.batch_tex_id) { change_batch = true; } // N.B. polygons don't have color thus don't need a batch change with color // This code is left as reference in case of problems. // if (!r_fill_state.curr_batch->color.equals(modulate)) { // change_batch = true; // bdata.total_color_changes++; // } if (change_batch) { // put the tex pixel size in a local (less verbose and can be a register) const BatchTex &batchtex = bdata.batch_textures[r_fill_state.batch_tex_id]; batchtex.tex_pixel_size.to(r_fill_state.texpixel_size); if (bdata.settings_uv_contract) { r_fill_state.contract_uvs = (batchtex.flags & RS::TEXTURE_FLAG_FILTER) == 0; } // open new batch (this should never fail, it dynamically grows) r_fill_state.curr_batch = _batch_request_new(false); r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_POLY; // modulate unused except for debugging? r_fill_state.curr_batch->color.set(modulate); r_fill_state.curr_batch->batch_texture_id = r_fill_state.batch_tex_id; r_fill_state.curr_batch->first_command = command_num; r_fill_state.curr_batch->num_commands = num_inds; // r_fill_state.curr_batch->num_elements = num_inds; r_fill_state.curr_batch->first_vert = bdata.total_verts; } else { // we could alternatively do the count when closing a batch .. perhaps more efficient r_fill_state.curr_batch->num_commands += num_inds; } // PRECALCULATE THE COLORS (as there may be less colors than there are indices // in either hardware or software paths) BatchColor vcol; int num_verts = p_poly->points.size(); // in special cases, only 1 color is specified by convention, so we want to preset this // to use in all verts. 
if (p_poly->colors.size()) { vcol.set(p_poly->colors[0] * modulate); } else { // color is undefined, use modulate color straight vcol.set(modulate); } BatchColor *precalced_colors = (BatchColor *)alloca(num_verts * sizeof(BatchColor)); // two stage, super efficient setup of precalculated colors int num_colors_specified = p_poly->colors.size(); for (int n = 0; n < num_colors_specified; n++) { vcol.set(p_poly->colors[n] * modulate); precalced_colors[n] = vcol; } for (int n = num_colors_specified; n < num_verts; n++) { precalced_colors[n] = vcol; } bool software_transform = (r_fill_state.transform_mode != TM_NONE) && (!use_large_verts); for (int n = 0; n < num_inds; n++) { int ind = p_poly->indices[n]; DEV_CHECK_ONCE(ind < p_poly->points.size()); // recover at runtime from invalid polys (the editor may send invalid polys) if ((unsigned int)ind >= (unsigned int)num_verts) { // will recover as long as there is at least one vertex. // if there are no verts, we will have quick rejected earlier in this function ind = 0; } // this could be moved outside the loop if (software_transform) { Vector2 pos = p_poly->points[ind]; _software_transform_vertex(pos, r_fill_state.transform_combined); bvs[n].pos.set(pos.x, pos.y); } else { const Point2 &pos = p_poly->points[ind]; bvs[n].pos.set(pos.x, pos.y); } if (ind < p_poly->uvs.size()) { const Point2 &uv = p_poly->uvs[ind]; bvs[n].uv.set(uv.x, uv.y); } else { bvs[n].uv.set(0.0f, 0.0f); } vertex_colors[n] = precalced_colors[ind]; if (use_modulate) { vertex_modulates[n] = vertex_modulates[0]; } if (use_large_verts) { // reuse precalced transform (same for each vertex within polygon) pBT[n] = pBT[0]; } } // increment total vert count bdata.total_verts += num_inds; return false; } T_PREAMBLE template bool C_PREAMBLE::_prefill_rect(RasterizerCanvas::Item::CommandRect *rect, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RasterizerCanvas::Item::Command *const *commands, RasterizerCanvas::Item *p_item, 
bool multiply_final_modulate) { bool change_batch = false; // conditions for creating a new batch if (r_fill_state.curr_batch->type != RasterizerStorageCommon::BT_RECT) { // don't allow joining to a different sequence type if (r_fill_state.sequence_batch_type_flags & (~RasterizerStorageCommon::BTF_RECT)) { // don't allow joining to a different sequence type r_command_start = command_num; return true; } r_fill_state.sequence_batch_type_flags |= RasterizerStorageCommon::BTF_RECT; change_batch = true; // check for special case if there is only a single or small number of rects, // in which case we will use the legacy default rect renderer // because it is faster for single rects // we only want to do this if not a joined item with more than 1 item, // because joined items with more than 1, the command * will be incorrect // NOTE - this is assuming that use_hardware_transform means that it is a non-joined item!! // If that assumption is incorrect this will go horribly wrong. if (bdata.settings_use_single_rect_fallback && r_fill_state.is_single_item) { bool is_single_rect = false; int command_num_next = command_num + 1; if (command_num_next < command_count) { RasterizerCanvas::Item::Command *command_next = commands[command_num_next]; if ((command_next->type != RasterizerCanvas::Item::Command::TYPE_RECT) && (command_next->type != RasterizerCanvas::Item::Command::TYPE_TRANSFORM)) { is_single_rect = true; } } else { is_single_rect = true; } // if it is a rect on its own, do exactly the same as the default routine if (is_single_rect) { _prefill_default_batch(r_fill_state, command_num, *p_item); return false; } } // if use hardware transform } // try to create vertices BEFORE creating a batch, // because if the vertex buffer is full, we need to finish this // function, draw what we have so far, and then start a new set of batches // request FOUR vertices at a time, this is more efficient BatchVertex *bvs = bdata.vertices.request(4); if (!bvs) { // run out of space in the 
vertex buffer .. finish this function and draw what we have so far // return where we got to r_command_start = command_num; return true; } // are we using large FVF? const bool use_large_verts = bdata.use_large_verts; const bool use_modulate = bdata.use_modulate; Color col = rect->modulate; // use_modulate and use_large_verts should have been checked in the calling prefill_item function. // we don't want to apply the modulate on the CPU if it is stored in the vertex format, it will // be applied in the shader if (multiply_final_modulate) { col *= r_fill_state.final_modulate; } // instead of doing all the texture preparation for EVERY rect, // we build a list of texture combinations and do this once off. // This means we have a potentially rather slow step to identify which texture combo // using the RIDs. int old_batch_tex_id = r_fill_state.batch_tex_id; r_fill_state.batch_tex_id = _batch_find_or_create_tex(rect->texture, rect->normal_map, rect->flags & RasterizerCanvas::CANVAS_RECT_TILE, old_batch_tex_id); //r_fill_state.use_light_angles = send_light_angles; if (SEND_LIGHT_ANGLES) { bdata.use_light_angles = true; } // conditions for creating a new batch if (old_batch_tex_id != r_fill_state.batch_tex_id) { change_batch = true; } // we need to treat color change separately because we need to count these // to decide whether to switch on the fly to colored vertices. 
if (!change_batch && !r_fill_state.curr_batch->color.equals(col)) { change_batch = true; bdata.total_color_changes++; } if (change_batch) { // put the tex pixel size in a local (less verbose and can be a register) const BatchTex &batchtex = bdata.batch_textures[r_fill_state.batch_tex_id]; batchtex.tex_pixel_size.to(r_fill_state.texpixel_size); if (bdata.settings_uv_contract) { r_fill_state.contract_uvs = (batchtex.flags & RS::TEXTURE_FLAG_FILTER) == 0; } // need to preserve texpixel_size between items //r_fill_state.texpixel_size = r_fill_state.texpixel_size; // open new batch (this should never fail, it dynamically grows) r_fill_state.curr_batch = _batch_request_new(false); r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_RECT; r_fill_state.curr_batch->color.set(col); r_fill_state.curr_batch->batch_texture_id = r_fill_state.batch_tex_id; r_fill_state.curr_batch->first_command = command_num; r_fill_state.curr_batch->num_commands = 1; //r_fill_state.curr_batch->first_quad = bdata.total_quads; r_fill_state.curr_batch->first_vert = bdata.total_verts; } else { // we could alternatively do the count when closing a batch .. perhaps more efficient r_fill_state.curr_batch->num_commands++; } // fill the quad geometry Vector2 mins = rect->rect.position; if (r_fill_state.transform_mode == TM_TRANSLATE) { if (!use_large_verts) { _software_transform_vertex(mins, r_fill_state.transform_combined); } } Vector2 maxs = mins + rect->rect.size; // just aliases BatchVertex *bA = &bvs[0]; BatchVertex *bB = &bvs[1]; BatchVertex *bC = &bvs[2]; BatchVertex *bD = &bvs[3]; bA->pos.x = mins.x; bA->pos.y = mins.y; bB->pos.x = maxs.x; bB->pos.y = mins.y; bC->pos.x = maxs.x; bC->pos.y = maxs.y; bD->pos.x = mins.x; bD->pos.y = maxs.y; // possibility of applying flips here for normal mapping .. 
but they don't seem to be used if (rect->rect.size.x < 0) { SWAP(bA->pos, bB->pos); SWAP(bC->pos, bD->pos); } if (rect->rect.size.y < 0) { SWAP(bA->pos, bD->pos); SWAP(bB->pos, bC->pos); } if (r_fill_state.transform_mode == TM_ALL) { if (!use_large_verts) { _software_transform_vertex(bA->pos, r_fill_state.transform_combined); _software_transform_vertex(bB->pos, r_fill_state.transform_combined); _software_transform_vertex(bC->pos, r_fill_state.transform_combined); _software_transform_vertex(bD->pos, r_fill_state.transform_combined); } } // uvs Vector2 src_min; Vector2 src_max; if (rect->flags & RasterizerCanvas::CANVAS_RECT_REGION) { src_min = rect->source.position; src_max = src_min + rect->source.size; src_min *= r_fill_state.texpixel_size; src_max *= r_fill_state.texpixel_size; const float uv_epsilon = bdata.settings_uv_contract_amount; // nudge offset for the maximum to prevent precision error on GPU reading into line outside the source rect // this is very difficult to get right. if (r_fill_state.contract_uvs) { src_min.x += uv_epsilon; src_min.y += uv_epsilon; src_max.x -= uv_epsilon; src_max.y -= uv_epsilon; } } else { src_min = Vector2(0, 0); src_max = Vector2(1, 1); } // 10% faster calculating the max first Vector2 uvs[4] = { src_min, Vector2(src_max.x, src_min.y), src_max, Vector2(src_min.x, src_max.y), }; // for encoding in light angle // flips should be optimized out when not being used for light angle. 
bool flip_h = false; bool flip_v = false; if (rect->flags & RasterizerCanvas::CANVAS_RECT_TRANSPOSE) { SWAP(uvs[1], uvs[3]); } if (rect->flags & RasterizerCanvas::CANVAS_RECT_FLIP_H) { SWAP(uvs[0], uvs[1]); SWAP(uvs[2], uvs[3]); flip_h = !flip_h; flip_v = !flip_v; } if (rect->flags & RasterizerCanvas::CANVAS_RECT_FLIP_V) { SWAP(uvs[0], uvs[3]); SWAP(uvs[1], uvs[2]); flip_v = !flip_v; } bA->uv.set(uvs[0]); bB->uv.set(uvs[1]); bC->uv.set(uvs[2]); bD->uv.set(uvs[3]); // modulate if (use_modulate) { // store the final modulate separately from the rect modulate BatchColor *pBC = bdata.vertex_modulates.request(4); RAST_DEBUG_ASSERT(pBC); pBC[0].set(r_fill_state.final_modulate); pBC[1] = pBC[0]; pBC[2] = pBC[0]; pBC[3] = pBC[0]; } if (use_large_verts) { // store the transform separately BatchTransform *pBT = bdata.vertex_transforms.request(4); RAST_DEBUG_ASSERT(pBT); const Transform2D &tr = r_fill_state.transform_combined; pBT[0].translate.set(tr.columns[2]); pBT[0].basis[0].set(tr.columns[0][0], tr.columns[0][1]); pBT[0].basis[1].set(tr.columns[1][0], tr.columns[1][1]); pBT[1] = pBT[0]; pBT[2] = pBT[0]; pBT[3] = pBT[0]; } if (SEND_LIGHT_ANGLES) { // we can either keep the light angles in sync with the verts when writing, // or sync them up during translation. We are syncing in translation. // N.B. There may be batches that don't require light_angles between batches that do. float *angles = bdata.light_angles.request(4); RAST_DEBUG_ASSERT(angles); float angle = 0.0f; const float TWO_PI = Math_PI * 2; if (r_fill_state.transform_mode != TM_NONE) { const Transform2D &tr = r_fill_state.transform_combined; // apply to an x axis // the x axis and y axis can be taken directly from the transform (no need to xform identity vectors) Vector2 x_axis(tr.columns[0][0], tr.columns[0][1]); // have to do a y axis to check for scaling flips // this is hassle and extra slowness. We could only allow flips via the flags. 
Vector2 y_axis(tr.columns[1][0], tr.columns[1][1]); // has the x / y axis flipped due to scaling? float cross = x_axis.cross(y_axis); if (cross < 0.0f) { flip_v = !flip_v; } // passing an angle is smaller than a vector, it can be reconstructed in the shader angle = x_axis.angle(); // we don't want negative angles, as negative is used to encode flips. // This moves range from -PI to PI to 0 to TWO_PI if (angle < 0.0f) { angle += TWO_PI; } } // if transform needed // if horizontal flip, angle is shifted by 180 degrees if (flip_h) { angle += Math_PI; // mod to get back to 0 to TWO_PI range angle = fmodf(angle, TWO_PI); } // add 1 (to take care of zero floating point error with sign) angle += 1.0f; // flip if necessary to indicate a vertical flip in the shader if (flip_v) { angle *= -1.0f; } // light angle must be sent for each vert, instead as a single uniform in the uniform draw method // this has the benefit of enabling batching with light angles. for (int n = 0; n < 4; n++) { angles[n] = angle; } } // increment quad count bdata.total_quads++; bdata.total_verts += 4; return false; } T_PREAMBLE template bool C_PREAMBLE::_prefill_multirect(RasterizerCanvas::Item::CommandMultiRect *mrect, FillState &r_fill_state, int &r_command_start, int command_num, bool multiply_final_modulate) { bool change_batch = false; // conditions for creating a new batch if (r_fill_state.curr_batch->type != RasterizerStorageCommon::BT_RECT) { // don't allow joining to a different sequence type if (r_fill_state.sequence_batch_type_flags & (~RasterizerStorageCommon::BTF_RECT)) { // don't allow joining to a different sequence type r_command_start = command_num; return true; } r_fill_state.sequence_batch_type_flags |= RasterizerStorageCommon::BTF_RECT; change_batch = true; } // try to create vertices BEFORE creating a batch, // because if the vertex buffer is full, we need to finish this // function, draw what we have so far, and then start a new set of batches // request ALL vertices at a time, 
this is more efficient uint32_t total_verts = 4 * mrect->rects.size(); BatchVertex *bvs = bdata.vertices.request(total_verts); if (!bvs) { // run out of space in the vertex buffer .. finish this function and draw what we have so far // return where we got to r_command_start = command_num; // Check for an error condition - if we have been creating MultiRects that require more than // the maximum number of verts in the buffer, this could cause an infinite loop. ERR_FAIL_COND_V(total_verts > bdata.vertex_buffer_size_units, false); return true; } // are we using large FVF? const bool use_large_verts = bdata.use_large_verts; const bool use_modulate = bdata.use_modulate; Color col = mrect->modulate; // use_modulate and use_large_verts should have been checked in the calling prefill_item function. // we don't want to apply the modulate on the CPU if it is stored in the vertex format, it will // be applied in the shader if (multiply_final_modulate) { col *= r_fill_state.final_modulate; } // instead of doing all the texture preparation for EVERY rect, // we build a list of texture combinations and do this once off. // This means we have a potentially rather slow step to identify which texture combo // using the RIDs. int old_batch_tex_id = r_fill_state.batch_tex_id; r_fill_state.batch_tex_id = _batch_find_or_create_tex(mrect->texture, mrect->normal_map, mrect->flags & RasterizerCanvas::CANVAS_RECT_TILE, old_batch_tex_id); //r_fill_state.use_light_angles = send_light_angles; if (SEND_LIGHT_ANGLES) { bdata.use_light_angles = true; } // conditions for creating a new batch if (old_batch_tex_id != r_fill_state.batch_tex_id) { change_batch = true; } // we need to treat color change separately because we need to count these // to decide whether to switch on the fly to colored vertices. 
if (!change_batch && !r_fill_state.curr_batch->color.equals(col)) { change_batch = true; bdata.total_color_changes++; } uint32_t num_rects = mrect->rects.size(); if (change_batch) { // put the tex pixel size in a local (less verbose and can be a register) const BatchTex &batchtex = bdata.batch_textures[r_fill_state.batch_tex_id]; batchtex.tex_pixel_size.to(r_fill_state.texpixel_size); if (bdata.settings_uv_contract) { r_fill_state.contract_uvs = (batchtex.flags & RS::TEXTURE_FLAG_FILTER) == 0; } // need to preserve texpixel_size between items //r_fill_state.texpixel_size = r_fill_state.texpixel_size; // open new batch (this should never fail, it dynamically grows) r_fill_state.curr_batch = _batch_request_new(false); r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_RECT; r_fill_state.curr_batch->color.set(col); r_fill_state.curr_batch->batch_texture_id = r_fill_state.batch_tex_id; r_fill_state.curr_batch->first_command = command_num; r_fill_state.curr_batch->num_commands = num_rects; //r_fill_state.curr_batch->first_quad = bdata.total_quads; r_fill_state.curr_batch->first_vert = bdata.total_verts; } else { // we could alternatively do the count when closing a batch .. perhaps more efficient r_fill_state.curr_batch->num_commands += num_rects; } // test for simplified pipeline const uint8_t disallow_flags = RasterizerCanvas::CANVAS_RECT_TRANSPOSE | RasterizerCanvas::CANVAS_RECT_FLIP_H | RasterizerCanvas::CANVAS_RECT_FLIP_V; if ((mrect->flags & RasterizerCanvas::CANVAS_RECT_REGION) && ((mrect->flags & disallow_flags) == 0)) { // simplified pipeline for (uint32_t n = 0; n < num_rects; n++) { const Rect2 &rect = mrect->rects[n]; const Rect2 &source = mrect->sources[n]; // fill the quad geometry Vector2 mins = rect.position; // just aliases BatchVertex *bA = &bvs[0]; BatchVertex *bB = &bvs[1]; BatchVertex *bC = &bvs[2]; BatchVertex *bD = &bvs[3]; // possibility of applying flips here for normal mapping .. 
but they don't seem to be used #ifdef TOOLS_ENABLED if (rect.size.x < 0) { ERR_PRINT_ONCE("MultiRect with negative size detected. Ensure rects are non-negative."); } if (rect.size.y < 0) { ERR_PRINT_ONCE("MultiRect with negative size detected. Ensure rects are non-negative."); } #endif if (r_fill_state.transform_mode == TM_TRANSLATE) { if (!use_large_verts) { _software_transform_vertex(mins, r_fill_state.transform_combined); } } Vector2 maxs = mins + rect.size; bA->pos.x = mins.x; bA->pos.y = mins.y; bB->pos.x = maxs.x; bB->pos.y = mins.y; bC->pos.x = maxs.x; bC->pos.y = maxs.y; bD->pos.x = mins.x; bD->pos.y = maxs.y; if (r_fill_state.transform_mode == TM_ALL) { if (!use_large_verts) { _software_transform_vertex(bA->pos, r_fill_state.transform_combined); _software_transform_vertex(bB->pos, r_fill_state.transform_combined); _software_transform_vertex(bC->pos, r_fill_state.transform_combined); _software_transform_vertex(bD->pos, r_fill_state.transform_combined); } } // uvs Vector2 src_min; Vector2 src_max; src_min = source.position; src_max = src_min + source.size; src_min *= r_fill_state.texpixel_size; src_max *= r_fill_state.texpixel_size; const float uv_epsilon = bdata.settings_uv_contract_amount; // nudge offset for the maximum to prevent precision error on GPU reading into line outside the source rect // this is very difficult to get right. 
if (r_fill_state.contract_uvs) { src_min.x += uv_epsilon; src_min.y += uv_epsilon; src_max.x -= uv_epsilon; src_max.y -= uv_epsilon; } // 10% faster calculating the max first Vector2 uvs[4] = { src_min, Vector2(src_max.x, src_min.y), src_max, Vector2(src_min.x, src_max.y), }; bA->uv.set(uvs[0]); bB->uv.set(uvs[1]); bC->uv.set(uvs[2]); bD->uv.set(uvs[3]); bvs += 4; // move the destination verts on by 4 each rect } // for n through rects } else { // full pipeline for (uint32_t n = 0; n < num_rects; n++) { const Rect2 &rect = mrect->rects[n]; const Rect2 &source = mrect->sources[n]; // fill the quad geometry Vector2 mins = rect.position; if (r_fill_state.transform_mode == TM_TRANSLATE) { if (!use_large_verts) { _software_transform_vertex(mins, r_fill_state.transform_combined); } } Vector2 maxs = mins + rect.size; // just aliases BatchVertex *bA = &bvs[0]; BatchVertex *bB = &bvs[1]; BatchVertex *bC = &bvs[2]; BatchVertex *bD = &bvs[3]; bA->pos.x = mins.x; bA->pos.y = mins.y; bB->pos.x = maxs.x; bB->pos.y = mins.y; bC->pos.x = maxs.x; bC->pos.y = maxs.y; bD->pos.x = mins.x; bD->pos.y = maxs.y; // possibility of applying flips here for normal mapping .. but they don't seem to be used #ifdef TOOLS_ENABLED if (rect.size.x < 0) { //SWAP(bA->pos, bB->pos); //SWAP(bC->pos, bD->pos); ERR_PRINT_ONCE("MultiRect with negative size detected. Ensure rects are non-negative."); } if (rect.size.y < 0) { //SWAP(bA->pos, bD->pos); //SWAP(bB->pos, bC->pos); ERR_PRINT_ONCE("MultiRect with negative size detected. 
Ensure rects are non-negative."); } #endif if (r_fill_state.transform_mode == TM_ALL) { if (!use_large_verts) { _software_transform_vertex(bA->pos, r_fill_state.transform_combined); _software_transform_vertex(bB->pos, r_fill_state.transform_combined); _software_transform_vertex(bC->pos, r_fill_state.transform_combined); _software_transform_vertex(bD->pos, r_fill_state.transform_combined); } } // uvs Vector2 src_min; Vector2 src_max; if (mrect->flags & RasterizerCanvas::CANVAS_RECT_REGION) { src_min = source.position; src_max = src_min + source.size; src_min *= r_fill_state.texpixel_size; src_max *= r_fill_state.texpixel_size; const float uv_epsilon = bdata.settings_uv_contract_amount; // nudge offset for the maximum to prevent precision error on GPU reading into line outside the source rect // this is very difficult to get right. if (r_fill_state.contract_uvs) { src_min.x += uv_epsilon; src_min.y += uv_epsilon; src_max.x -= uv_epsilon; src_max.y -= uv_epsilon; } } else { src_min = Vector2(0, 0); src_max = Vector2(1, 1); } // 10% faster calculating the max first Vector2 uvs[4] = { src_min, Vector2(src_max.x, src_min.y), src_max, Vector2(src_min.x, src_max.y), }; if (mrect->flags & RasterizerCanvas::CANVAS_RECT_TRANSPOSE) { SWAP(uvs[1], uvs[3]); } if (mrect->flags & RasterizerCanvas::CANVAS_RECT_FLIP_H) { SWAP(uvs[0], uvs[1]); SWAP(uvs[2], uvs[3]); } if (mrect->flags & RasterizerCanvas::CANVAS_RECT_FLIP_V) { SWAP(uvs[0], uvs[3]); SWAP(uvs[1], uvs[2]); } bA->uv.set(uvs[0]); bB->uv.set(uvs[1]); bC->uv.set(uvs[2]); bD->uv.set(uvs[3]); bvs += 4; // move the destination verts on by 4 each rect } // for n through rects } // full pipeline // modulate if (use_modulate) { // store the final modulate separately from the rect modulate BatchColor *pBC = bdata.vertex_modulates.request(total_verts); RAST_DEBUG_ASSERT(pBC); pBC[0].set(r_fill_state.final_modulate); for (uint32_t n = 1; n < total_verts; n++) { pBC[n] = pBC[0]; } } // they will all have the same vertex transforms if 
(use_large_verts) { // store the transform separately BatchTransform *pBT = bdata.vertex_transforms.request(total_verts); RAST_DEBUG_ASSERT(pBT); BatchTransform *pBT_first = pBT; const Transform2D &tr = r_fill_state.transform_combined; pBT[0].translate.set(tr.columns[2]); pBT[0].basis[0].set(tr.columns[0][0], tr.columns[0][1]); pBT[0].basis[1].set(tr.columns[1][0], tr.columns[1][1]); for (uint32_t n = 1; n < num_rects * 4; n++) { pBT++; *pBT = *pBT_first; } } if (SEND_LIGHT_ANGLES) { // SAME FOR ALL // for encoding in light angle bool flip_h = false; bool flip_v = false; if (mrect->flags & RasterizerCanvas::CANVAS_RECT_FLIP_H) { flip_h = !flip_h; flip_v = !flip_v; } if (mrect->flags & RasterizerCanvas::CANVAS_RECT_FLIP_V) { flip_v = !flip_v; } // we can either keep the light angles in sync with the verts when writing, // or sync them up during translation. We are syncing in translation. // N.B. There may be batches that don't require light_angles between batches that do. float *angles = bdata.light_angles.request(total_verts); RAST_DEBUG_ASSERT(angles); float angle = 0.0f; const float TWO_PI = Math_PI * 2; if (r_fill_state.transform_mode != TM_NONE) { const Transform2D &tr = r_fill_state.transform_combined; // apply to an x axis // the x axis and y axis can be taken directly from the transform (no need to xform identity vectors) Vector2 x_axis(tr.columns[0][0], tr.columns[0][1]); // have to do a y axis to check for scaling flips // this is hassle and extra slowness. We could only allow flips via the flags. Vector2 y_axis(tr.columns[1][0], tr.columns[1][1]); // has the x / y axis flipped due to scaling? float cross = x_axis.cross(y_axis); if (cross < 0.0f) { flip_v = !flip_v; } // passing an angle is smaller than a vector, it can be reconstructed in the shader angle = x_axis.angle(); // we don't want negative angles, as negative is used to encode flips. 
// This moves range from -PI to PI to 0 to TWO_PI if (angle < 0.0f) { angle += TWO_PI; } } // if transform needed // if horizontal flip, angle is shifted by 180 degrees if (flip_h) { angle += Math_PI; // mod to get back to 0 to TWO_PI range angle = fmodf(angle, TWO_PI); } // add 1 (to take care of zero floating point error with sign) angle += 1.0f; // flip if necessary to indicate a vertical flip in the shader if (flip_v) { angle *= -1.0f; } // light angle must be sent for each vert, instead as a single uniform in the uniform draw method // this has the benefit of enabling batching with light angles. for (uint32_t n = 0; n < total_verts; n++) { angles[n] = angle; } } // increment quad count bdata.total_quads += num_rects; bdata.total_verts += total_verts; return false; } // This function may be called MULTIPLE TIMES for each item, so needs to record how far it has got PREAMBLE(bool)::prefill_joined_item(FillState &r_fill_state, int &r_command_start, RasterizerCanvas::Item *p_item, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material) { // we will prefill batches and vertices ready for sending in one go to the vertex buffer int command_count = p_item->commands.size(); RasterizerCanvas::Item::Command *const *commands = p_item->commands.ptr(); // whether to multiply final modulate on the CPU, or pass it in the FVF and apply in the shader bool multiply_final_modulate = true; if (r_fill_state.is_single_item || bdata.use_modulate || bdata.use_large_verts) { multiply_final_modulate = false; } // start batch is a dummy batch (tex id -1) .. could be made more efficient if (!r_fill_state.curr_batch) { // allocate dummy batch on the stack, it should always get replaced // note that the rest of the structure is uninitialized, this should not matter // if the type is checked before anything else. 
r_fill_state.curr_batch = (Batch *)alloca(sizeof(Batch)); r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_DUMMY; // this is assumed to be the case //CRASH_COND (r_fill_state.transform_extra_command_number_p1); } // we need to return which command we got up to, so // store this outside the loop int command_num; // do as many commands as possible until the vertex buffer will be full up for (command_num = r_command_start; command_num < command_count; command_num++) { RasterizerCanvas::Item::Command *command = commands[command_num]; switch (command->type) { default: { _prefill_default_batch(r_fill_state, command_num, *p_item); } break; case RasterizerCanvas::Item::Command::TYPE_TRANSFORM: { // if the extra matrix has been sent already, // break this extra matrix software path (as we don't want to unset it on the GPU etc) if (r_fill_state.extra_matrix_sent) { _prefill_default_batch(r_fill_state, command_num, *p_item); // keep track of the combined matrix on the CPU in parallel, in case we use large vertex format RasterizerCanvas::Item::CommandTransform *transform = static_cast(command); const Transform2D &extra_matrix = transform->xform; r_fill_state.transform_combined = p_item->final_transform * extra_matrix; } else { // Extra matrix fast path. // Instead of sending the command immediately, we store the modified transform (in combined) // for software transform, and only flush this transform command if we NEED to (i.e. 
we want to // render some default commands) RasterizerCanvas::Item::CommandTransform *transform = static_cast(command); const Transform2D &extra_matrix = transform->xform; if (r_fill_state.is_single_item && !r_fill_state.use_attrib_transform) { // if we are using hardware transform mode, we have already sent the final transform, // so we only want to software transform the extra matrix r_fill_state.transform_combined = extra_matrix; } else { r_fill_state.transform_combined = p_item->final_transform * extra_matrix; } // after a transform command, always use some form of software transform (either the combined final + extra, or just the extra) // until we flush this dirty extra matrix because we need to render default commands. r_fill_state.transform_mode = _find_transform_mode(r_fill_state.transform_combined); // make a note of which command the dirty extra matrix is store in, so we can send it later // if necessary r_fill_state.transform_extra_command_number_p1 = command_num + 1; // plus 1 so we can test against zero } } break; case RasterizerCanvas::Item::Command::TYPE_RECT: { RasterizerCanvas::Item::CommandRect *rect = static_cast(command); // unoptimized - could this be done once per batch / batch texture? bool send_light_angles = rect->normal_map != RID(); bool buffer_full = false; // the template params must be explicit for compilation, // this forces building the multiple versions of the function. 
if (send_light_angles) { buffer_full = _prefill_rect(rect, r_fill_state, r_command_start, command_num, command_count, commands, p_item, multiply_final_modulate); } else { buffer_full = _prefill_rect(rect, r_fill_state, r_command_start, command_num, command_count, commands, p_item, multiply_final_modulate); } if (buffer_full) { return true; } } break; case RasterizerCanvas::Item::Command::TYPE_MULTIRECT: { RasterizerCanvas::Item::CommandMultiRect *mrect = static_cast(command); // MultRects with no rects should ideally not be created ERR_CONTINUE(!mrect->rects.size()); bool send_light_angles = mrect->normal_map != RID(); bool buffer_full = false; // the template params must be explicit for compilation, // this forces building the multiple versions of the function. if (send_light_angles) { buffer_full = _prefill_multirect(mrect, r_fill_state, r_command_start, command_num, multiply_final_modulate); } else { buffer_full = _prefill_multirect(mrect, r_fill_state, r_command_start, command_num, multiply_final_modulate); } if (buffer_full) { return true; } } break; case RasterizerCanvas::Item::Command::TYPE_NINEPATCH: { RasterizerCanvas::Item::CommandNinePatch *np = static_cast(command); if ((np->axis_x != RenderingServer::NINE_PATCH_STRETCH) || (np->axis_y != RenderingServer::NINE_PATCH_STRETCH)) { // not accelerated _prefill_default_batch(r_fill_state, command_num, *p_item); continue; } // unoptimized - could this be done once per batch / batch texture? 
bool send_light_angles = np->normal_map != RID(); bool buffer_full = false; if (send_light_angles) { buffer_full = _prefill_ninepatch(np, r_fill_state, r_command_start, command_num, command_count, p_item, multiply_final_modulate); } else { buffer_full = _prefill_ninepatch(np, r_fill_state, r_command_start, command_num, command_count, p_item, multiply_final_modulate); } if (buffer_full) { return true; } } break; case RasterizerCanvas::Item::Command::TYPE_LINE: { RasterizerCanvas::Item::CommandLine *line = static_cast(command); if (line->width <= 1) { bool buffer_full = _prefill_line(line, r_fill_state, r_command_start, command_num, command_count, p_item, multiply_final_modulate); if (buffer_full) { return true; } } else { // not accelerated _prefill_default_batch(r_fill_state, command_num, *p_item); } } break; case RasterizerCanvas::Item::Command::TYPE_POLYGON: { RasterizerCanvas::Item::CommandPolygon *polygon = static_cast(command); // unoptimized - could this be done once per batch / batch texture? bool send_light_angles = polygon->normal_map != RID(); bool buffer_full = false; if (send_light_angles) { // polygon with light angles is not yet implemented // for batching .. this means software skinned with light angles won't work _prefill_default_batch(r_fill_state, command_num, *p_item); } else { buffer_full = _prefill_polygon(polygon, r_fill_state, r_command_start, command_num, command_count, p_item, multiply_final_modulate); } if (buffer_full) { return true; } } break; } } // VERY IMPORTANT to return where we got to, because this func may be called multiple // times per item. // Don't miss out on this step by calling return earlier in the function without setting r_command_start. 
r_command_start = command_num;

	return false;
}

// Chooses the vertex format (FVF) for the batches accumulated so far, translates the
// batch vertices to that format where needed, uploads the buffers and renders them.
// bdata.fvf is restored from backup_fvf before returning, so an FVF_REGULAR override
// made for line sequences does not persist past this flush.
PREAMBLE(void)::flush_render_batches(RasterizerCanvas::Item *p_first_item, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, uint32_t p_sequence_batch_type_flags) {
	// some heuristic to decide whether to use colored verts.
	// feel free to tweak this.
	// this could use hysteresis, to prevent jumping between methods
	// .. however probably not necessary
	bdata.use_colored_vertices = false;

	RasterizerStorageCommon::FVF backup_fvf = bdata.fvf;

	// the batch type in this flush can override the fvf from the joined item.
	// The joined item uses the material to determine fvf, assuming a rect...
	// however with custom drawing, lines or polys may be drawn.
	// lines contain no color (this is stored in the batch), and polys contain vertex and color only.
	if (p_sequence_batch_type_flags & (RasterizerStorageCommon::BTF_LINE | RasterizerStorageCommon::BTF_LINE_AA)) {
		// do nothing, use the default regular FVF
		bdata.fvf = RasterizerStorageCommon::FVF_REGULAR;
	} else {
		// switch from regular to colored?
		if (bdata.fvf == RasterizerStorageCommon::FVF_REGULAR) {
			// only check whether to convert if there are quads (prevent divide by zero)
			// and we haven't decided to prevent color baking (due to e.g. MODULATE
			// being used in a shader)
			if (bdata.total_quads && !(bdata.joined_item_batch_flags & RasterizerStorageCommon::PREVENT_COLOR_BAKING)) {
				// minus 1 to prevent single primitives (ratio 1.0) always being converted to colored..
				// in that case it is slightly cheaper to just have the color as part of the batch
				float ratio = (float)(bdata.total_color_changes - 1) / (float)bdata.total_quads;

				// use bigger than or equal so that 0.0 threshold can force always using colored verts
				if (ratio >= bdata.settings_colored_vertex_format_threshold) {
					bdata.use_colored_vertices = true;
					bdata.fvf = RasterizerStorageCommon::FVF_COLOR;
				}
			}

			// if we used vertex colors
			if (bdata.vertex_colors.size()) {
				bdata.use_colored_vertices = true;
				bdata.fvf = RasterizerStorageCommon::FVF_COLOR;
			}

			// needs light angles?
			if (bdata.use_light_angles) {
				bdata.fvf = RasterizerStorageCommon::FVF_LIGHT_ANGLE;
			}
		}

		// the format chosen here is the one restored at the end of the function
		backup_fvf = bdata.fvf;
	} // if everything else except lines

	// translate if required to larger FVFs
	// NOTE(review): the _translate_batches_to_larger_FVF calls below carry no explicit
	// template arguments in this copy of the file - presumably each case selects a
	// different vertex type; confirm against the function's declaration.
	switch (bdata.fvf) {
		case RasterizerStorageCommon::FVF_UNBATCHED: // should not happen
			break;
		case RasterizerStorageCommon::FVF_REGULAR: // no change
			break;
		case RasterizerStorageCommon::FVF_COLOR: {
			// special case, where vertex colors are used (polys)
			// NOTE(review): this branch runs when vertex_colors is EMPTY, so the two
			// comments in this case look swapped relative to the conditions - confirm.
			if (!bdata.vertex_colors.size()) {
				_translate_batches_to_larger_FVF(p_sequence_batch_type_flags);
			} else {
				// normal, reduce number of batches by baking batch colors
				_translate_batches_to_vertex_colored_FVF();
			}
		} break;
		case RasterizerStorageCommon::FVF_LIGHT_ANGLE:
			_translate_batches_to_larger_FVF(p_sequence_batch_type_flags);
			break;
		case RasterizerStorageCommon::FVF_MODULATED:
			_translate_batches_to_larger_FVF(p_sequence_batch_type_flags);
			break;
		case RasterizerStorageCommon::FVF_LARGE:
			_translate_batches_to_larger_FVF(p_sequence_batch_type_flags);
			break;
	}

	// send buffers to opengl
	get_this()->_batch_upload_buffers();

#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED)
	if (bdata.diagnose_frame) {
		RasterizerCanvas::Item::Command *const *commands = p_first_item->commands.ptr();
		diagnose_batches(commands);
	}
#endif

	get_this()->render_batches(p_current_clip, r_reclip, p_material);

	// if we overrode the fvf for lines, set it back to the joined item fvf
	bdata.fvf = backup_fvf;

	// overwrite source buffers with garbage if error checking
#ifdef RASTERIZER_EXTRA_CHECKS
	_debug_write_garbage();
#endif
}

// Renders every item referenced by a joined item: prefills batches for each item's
// commands and flushes whenever prefill_joined_item() reports that a flush is needed.
PREAMBLE(void)::render_joined_item_commands(const BItemJoined &p_bij, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, bool p_lit, const RenderItemState &p_ris) {
	RasterizerCanvas::Item *item = nullptr;
	RasterizerCanvas::Item *first_item = bdata.item_refs[p_bij.first_item_ref].item;

	// fill_state and bdata have once off setup per joined item, and a smaller reset on flush
	FillState fill_state;
	fill_state.reset_joined_item(p_bij.is_single_item(), p_bij.use_attrib_transform());

	bdata.reset_joined_item();

	// should this joined item be using large FVF?
	if (p_bij.flags & RasterizerStorageCommon::USE_MODULATE_FVF) {
		bdata.use_modulate = true;
		bdata.fvf = RasterizerStorageCommon::FVF_MODULATED;
	}
	if (p_bij.flags & RasterizerStorageCommon::USE_LARGE_FVF) {
		bdata.use_modulate = true;
		bdata.use_large_verts = true;
		bdata.fvf = RasterizerStorageCommon::FVF_LARGE;
	}

	// make sure the joined item flags state is up to date, as it is read indirectly in
	// a couple of places from the state rather than from the joined item.
	// we could alternatively make sure to only read directly from the joined item
	// during the render, but it is probably more bug future proof to make sure both
	// are up to date.
	bdata.joined_item_batch_flags = p_bij.flags;

	// in the special case of custom shaders that read from VERTEX (i.e.
vertex position) // we want to disable software transform of extra matrix if (bdata.joined_item_batch_flags & RasterizerStorageCommon::PREVENT_VERTEX_BAKING) { fill_state.extra_matrix_sent = true; } for (unsigned int i = 0; i < p_bij.num_item_refs; i++) { const BItemRef &ref = bdata.item_refs[p_bij.first_item_ref + i]; item = ref.item; if (!p_lit) { // if not lit we use the complex calculated final modulate fill_state.final_modulate = ref.final_modulate; } else { // if lit we ignore canvas modulate and just use the item modulate fill_state.final_modulate = item->final_modulate; } int command_count = item->commands.size(); int command_start = 0; // ONCE OFF fill state setup, that will be retained over multiple calls to // prefill_joined_item() fill_state.transform_combined = item->final_transform; // decide the initial transform mode, and make a backup // in orig_transform_mode in case we need to switch back if (fill_state.use_software_transform) { fill_state.transform_mode = _find_transform_mode(fill_state.transform_combined); } else { fill_state.transform_mode = TM_NONE; } fill_state.orig_transform_mode = fill_state.transform_mode; // keep track of when we added an extra matrix // so we can defer sending until we see a default command fill_state.transform_extra_command_number_p1 = 0; while (command_start < command_count) { // fill as many batches as possible (until all done, or the vertex buffer is full) bool bFull = get_this()->prefill_joined_item(fill_state, command_start, item, p_current_clip, r_reclip, p_material); if (bFull) { // always pass first item (commands for default are always first item) flush_render_batches(first_item, p_current_clip, r_reclip, p_material, fill_state.sequence_batch_type_flags); // zero all the batch data ready for a new run bdata.reset_flush(); // don't zero all the fill state, some may need to be preserved fill_state.reset_flush(); } } } // flush if any left flush_render_batches(first_item, p_current_clip, r_reclip, p_material, 
fill_state.sequence_batch_type_flags); // zero all the batch data ready for a new run bdata.reset_flush(); } PREAMBLE(void)::_legacy_canvas_item_render_commands(RasterizerCanvas::Item *p_item, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material) { int command_count = p_item->commands.size(); // legacy .. just create one massive batch and render everything as before bdata.batches.reset(); Batch *batch = _batch_request_new(); batch->type = RasterizerStorageCommon::BT_DEFAULT; batch->num_commands = command_count; batch->item = p_item; get_this()->render_batches(p_current_clip, r_reclip, p_material); bdata.reset_flush(); } PREAMBLE(void)::record_items(RasterizerCanvas::Item *p_item_list, int p_z) { while (p_item_list) { BSortItem *s = bdata.sort_items.request_with_grow(); s->item = p_item_list; s->z_index = p_z; p_item_list = p_item_list->next; } } PREAMBLE(void)::join_sorted_items() { sort_items(); int z = RS::CANVAS_ITEM_Z_MIN; _render_item_state.item_group_z = z; for (int s = 0; s < bdata.sort_items.size(); s++) { const BSortItem &si = bdata.sort_items[s]; RasterizerCanvas::Item *ci = si.item; // change z? if (si.z_index != z) { z = si.z_index; // may not be required _render_item_state.item_group_z = z; // if z ranged lights are present, sometimes we have to disable joining over z_indices. // we do this here. // Note this restriction may be able to be relaxed with light bitfields, investigate! if (!bdata.join_across_z_indices) { _render_item_state.join_batch_break = true; } } bool join; if (_render_item_state.join_batch_break) { // always start a new batch for this item join = false; // could be another batch break (i.e. prevent NEXT item from joining this) // so we still need to run try_join_item // even though we know join is false. // also we need to run try_join_item for every item because it keeps the state up to date, // if we didn't run it the state would be out of date. 
get_this()->try_join_item(ci, _render_item_state, _render_item_state.join_batch_break);
		} else {
			join = get_this()->try_join_item(ci, _render_item_state, _render_item_state.join_batch_break);
		}

		// assume the first item will always return no join
		if (!join) {
			// start a new joined item whose first (and so far only) reference is this item
			_render_item_state.joined_item = bdata.items_joined.request_with_grow();
			_render_item_state.joined_item->first_item_ref = bdata.item_refs.size();
			_render_item_state.joined_item->num_item_refs = 1;
			_render_item_state.joined_item->bounding_rect = ci->global_rect_cache;
			_render_item_state.joined_item->z_index = z;
			_render_item_state.joined_item->flags = bdata.joined_item_batch_flags;

			// we need some logic to prevent joining items that have vastly different batch types
			_render_item_state.joined_item_batch_type_flags_prev = _render_item_state.joined_item_batch_type_flags_curr;

			// add the reference
			BItemRef *r = bdata.item_refs.request_with_grow();
			r->item = ci;
			// we are storing final_modulate in advance per item reference
			// for baking into vertex colors.
			// this may not be ideal... as we are increasing the size of item reference,
			// but it is stupidly complex to calculate later, which would probably be slower.
			r->final_modulate = _render_item_state.final_modulate;
		} else {
			// extend the current joined item: one more reference, and a grown bounding rect
			DEV_ASSERT(_render_item_state.joined_item != nullptr);
			_render_item_state.joined_item->num_item_refs += 1;
			_render_item_state.joined_item->bounding_rect = _render_item_state.joined_item->bounding_rect.merge(ci->global_rect_cache);

			BItemRef *r = bdata.item_refs.request_with_grow();
			r->item = ci;
			r->final_modulate = _render_item_state.final_modulate;

			// joined item references may introduce new flags
			_render_item_state.joined_item->flags |= bdata.joined_item_batch_flags;
		}

	} // for s through sort items
}

// Runs sort_items_from() on each candidate start position in the sort item list.
// Does nothing when the reordering lookahead setting is zero.
PREAMBLE(void)::sort_items() {
	// turned off?
if (!bdata.settings_item_reordering_lookahead) { return; } for (int s = 0; s < bdata.sort_items.size() - 2; s++) { if (sort_items_from(s)) { #if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) bdata.stats_items_sorted++; #endif } } } PREAMBLE(bool)::_sort_items_match(const BSortItem &p_a, const BSortItem &p_b) const { const RasterizerCanvas::Item *a = p_a.item; const RasterizerCanvas::Item *b = p_b.item; if (b->commands.size() != 1) { return false; } // tested outside function // if (a->commands.size() != 1) // return false; const RasterizerCanvas::Item::Command &cb = *b->commands[0]; if ((cb.type != RasterizerCanvas::Item::Command::TYPE_RECT) && (cb.type != RasterizerCanvas::Item::Command::TYPE_MULTIRECT)) { return false; } const RasterizerCanvas::Item::Command &ca = *a->commands[0]; // tested outside function // if (ca.type != Item::Command::TYPE_RECT) // return false; const RasterizerCanvas::Item::CommandRect *rect_a = static_cast(&ca); const RasterizerCanvas::Item::CommandRect *rect_b = static_cast(&cb); if (rect_a->texture != rect_b->texture) { return false; } /* ALTERNATIVE APPROACH NOT LIMITED TO RECTS const RasterizerCanvas::Item::Command &ca = *a->commands[0]; const RasterizerCanvas::Item::Command &cb = *b->commands[0]; if (ca.type != cb.type) return false; // do textures match? 
switch (ca.type) { default: break; case RasterizerCanvas::Item::Command::TYPE_RECT: { const RasterizerCanvas::Item::CommandRect *comm_a = static_cast(&ca); const RasterizerCanvas::Item::CommandRect *comm_b = static_cast(&cb); if (comm_a->texture != comm_b->texture) return false; } break; case RasterizerCanvas::Item::Command::TYPE_POLYGON: { const RasterizerCanvas::Item::CommandPolygon *comm_a = static_cast(&ca); const RasterizerCanvas::Item::CommandPolygon *comm_b = static_cast(&cb); if (comm_a->texture != comm_b->texture) return false; } break; } */ return true; } PREAMBLE(bool)::sort_items_from(int p_start) { #if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) ERR_FAIL_COND_V((p_start + 1) >= bdata.sort_items.size(), false); #endif const BSortItem &start = bdata.sort_items[p_start]; int start_z = start.z_index; // check start is the right type for sorting if (start.item->commands.size() != 1) { return false; } const RasterizerCanvas::Item::Command &command_start = *start.item->commands[0]; if ((command_start.type != RasterizerCanvas::Item::Command::TYPE_RECT) && (command_start.type != RasterizerCanvas::Item::Command::TYPE_MULTIRECT)) { return false; } BSortItem &second = bdata.sort_items[p_start + 1]; if (second.z_index != start_z) { // no sorting across z indices (for now) return false; } // if the neighbours are already a good match if (_sort_items_match(start, second)) // order is crucial, start first { return false; } // local cached aabb Rect2 second_AABB = second.item->global_rect_cache; // if the start and 2nd items overlap, can do no more if (start.item->global_rect_cache.intersects(second_AABB)) { return false; } // disallow sorting over copy back buffer if (second.item->copy_back_buffer) { return false; } // which neighbour to test int test_last = 2 + bdata.settings_item_reordering_lookahead; for (int test = 2; test < test_last; test++) { int test_sort_item_id = p_start + test; // if we've got to the end of the list, can't sort any more, give up if 
(test_sort_item_id >= bdata.sort_items.size()) { return false; } BSortItem *test_sort_item = &bdata.sort_items[test_sort_item_id]; // across z indices? if (test_sort_item->z_index != start_z) { return false; } RasterizerCanvas::Item *test_item = test_sort_item->item; // if the test item overlaps the second item, we can't swap, AT ALL // because swapping an item OVER this one would cause artefacts if (second_AABB.intersects(test_item->global_rect_cache)) { return false; } // do they match? if (!_sort_items_match(start, *test_sort_item)) // order is crucial, start first { continue; } // we can only swap if there are no AABB overlaps with sandwiched neighbours bool ok = true; // start from 2, no need to check 1 as the second has already been checked against this item // in the intersection test above for (int sn = 2; sn < test; sn++) { BSortItem *sandwich_neighbour = &bdata.sort_items[p_start + sn]; if (test_item->global_rect_cache.intersects(sandwich_neighbour->item->global_rect_cache)) { ok = false; break; } } if (!ok) { continue; } // it is ok to exchange them! 
BSortItem temp; temp.assign(second); second.assign(*test_sort_item); test_sort_item->assign(temp); return true; } // for test return false; } PREAMBLE(void)::_software_transform_vertex(BatchVector2 &r_v, const Transform2D &p_tr) const { Vector2 vc(r_v.x, r_v.y); vc = p_tr.xform(vc); r_v.set(vc); } PREAMBLE(void)::_software_transform_vertex(Vector2 &r_v, const Transform2D &p_tr) const { r_v = p_tr.xform(r_v); } PREAMBLE(void)::_translate_batches_to_vertex_colored_FVF() { // zeros the size and sets up how big each unit is bdata.unit_vertices.prepare(sizeof(BatchVertexColored)); const BatchColor *source_vertex_colors = &bdata.vertex_colors[0]; DEV_ASSERT(bdata.vertex_colors.size() == bdata.vertices.size()); int num_verts = bdata.vertices.size(); for (int n = 0; n < num_verts; n++) { const BatchVertex &bv = bdata.vertices[n]; BatchVertexColored *cv = (BatchVertexColored *)bdata.unit_vertices.request(); cv->pos = bv.pos; cv->uv = bv.uv; cv->col = *source_vertex_colors++; } } // Translation always involved adding color to the FVF, which enables // joining of batches that have different colors. // There is a trade off. Non colored verts are smaller so work faster, but // there comes a point where it is better to just use colored verts to avoid lots of // batches. // In addition this can optionally add light angles to the FVF, necessary for normal mapping. T_PREAMBLE template void C_PREAMBLE::_translate_batches_to_larger_FVF(uint32_t p_sequence_batch_type_flags) { bool include_poly_color = false; // we ONLY want to include the color verts in translation when using polys, // as rects do not write vertex colors, only colors per batch. 
if (p_sequence_batch_type_flags & RasterizerStorageCommon::BTF_POLY) { include_poly_color = INCLUDE_LIGHT_ANGLES | INCLUDE_MODULATE | INCLUDE_LARGE; } // zeros the size and sets up how big each unit is bdata.unit_vertices.prepare(sizeof(BATCH_VERTEX_TYPE)); bdata.batches_temp.reset(); // As the vertices_colored and batches_temp are 'mirrors' of the non-colored version, // the sizes should be equal, and allocations should never fail. Hence the use of debug // asserts to check program flow, these should not occur at runtime unless the allocation // code has been altered. DEV_ASSERT(bdata.unit_vertices.max_size() == bdata.vertices.max_size()); DEV_ASSERT(bdata.batches_temp.max_size() == bdata.batches.max_size()); Color curr_col(-1.0f, -1.0f, -1.0f, -1.0f); Batch *dest_batch = nullptr; const BatchColor *source_vertex_colors = &bdata.vertex_colors[0]; const float *source_light_angles = &bdata.light_angles[0]; const BatchColor *source_vertex_modulates = &bdata.vertex_modulates[0]; const BatchTransform *source_vertex_transforms = &bdata.vertex_transforms[0]; // translate the batches into vertex colored batches for (int n = 0; n < bdata.batches.size(); n++) { const Batch &source_batch = bdata.batches[n]; // does source batch use light angles? 
const BatchTex &btex = bdata.batch_textures[source_batch.batch_texture_id]; bool source_batch_uses_light_angles = btex.RID_normal != RID(); bool needs_new_batch = true; if (dest_batch) { if (dest_batch->type == source_batch.type) { if (source_batch.type == RasterizerStorageCommon::BT_RECT) { if (dest_batch->batch_texture_id == source_batch.batch_texture_id) { // add to previous batch dest_batch->num_commands += source_batch.num_commands; needs_new_batch = false; // create the colored verts (only if not default) int first_vert = source_batch.first_vert; int num_verts = source_batch.get_num_verts(); int end_vert = first_vert + num_verts; for (int v = first_vert; v < end_vert; v++) { RAST_DEV_DEBUG_ASSERT(bdata.vertices.size()); const BatchVertex &bv = bdata.vertices[v]; BATCH_VERTEX_TYPE *cv = (BATCH_VERTEX_TYPE *)bdata.unit_vertices.request(); RAST_DEBUG_ASSERT(cv); cv->pos = bv.pos; cv->uv = bv.uv; cv->col = source_batch.color; if (INCLUDE_LIGHT_ANGLES) { RAST_DEV_DEBUG_ASSERT(bdata.light_angles.size()); // this is required to allow compilation with non light angle vertex. // it should be compiled out. BatchVertexLightAngled *lv = (BatchVertexLightAngled *)cv; if (source_batch_uses_light_angles) { lv->light_angle = *source_light_angles++; } else { lv->light_angle = 0.0f; // dummy, unused in vertex shader (could possibly be left uninitialized, but probably bad idea) } } // if including light angles if (INCLUDE_MODULATE) { RAST_DEV_DEBUG_ASSERT(bdata.vertex_modulates.size()); BatchVertexModulated *mv = (BatchVertexModulated *)cv; mv->modulate = *source_vertex_modulates++; } // including modulate if (INCLUDE_LARGE) { RAST_DEV_DEBUG_ASSERT(bdata.vertex_transforms.size()); BatchVertexLarge *lv = (BatchVertexLarge *)cv; lv->transform = *source_vertex_transforms++; } // if including large } } // textures match } else { // default // we can still join, but only under special circumstances // does this ever happen? 
not sure at this stage, but left for future expansion uint32_t source_last_command = source_batch.first_command + source_batch.num_commands; if (source_last_command == dest_batch->first_command) { dest_batch->num_commands += source_batch.num_commands; needs_new_batch = false; } // if the commands line up exactly } } // if both batches are the same type } // if dest batch is valid if (needs_new_batch) { dest_batch = bdata.batches_temp.request(); RAST_DEBUG_ASSERT(dest_batch); *dest_batch = source_batch; // create the colored verts (only if not default) if (source_batch.type != RasterizerStorageCommon::BT_DEFAULT) { int first_vert = source_batch.first_vert; int num_verts = source_batch.get_num_verts(); int end_vert = first_vert + num_verts; for (int v = first_vert; v < end_vert; v++) { RAST_DEV_DEBUG_ASSERT(bdata.vertices.size()); const BatchVertex &bv = bdata.vertices[v]; BATCH_VERTEX_TYPE *cv = (BATCH_VERTEX_TYPE *)bdata.unit_vertices.request(); RAST_DEBUG_ASSERT(cv); cv->pos = bv.pos; cv->uv = bv.uv; // polys are special, they can have per vertex colors if (!include_poly_color) { cv->col = source_batch.color; } else { RAST_DEV_DEBUG_ASSERT(bdata.vertex_colors.size()); cv->col = *source_vertex_colors++; } if (INCLUDE_LIGHT_ANGLES) { RAST_DEV_DEBUG_ASSERT(bdata.light_angles.size()); // this is required to allow compilation with non light angle vertex. // it should be compiled out. 
BatchVertexLightAngled *lv = (BatchVertexLightAngled *)cv; if (source_batch_uses_light_angles) { lv->light_angle = *source_light_angles++; } else { lv->light_angle = 0.0f; // dummy, unused in vertex shader (could possibly be left uninitialized, but probably bad idea) } } // if using light angles if (INCLUDE_MODULATE) { RAST_DEV_DEBUG_ASSERT(bdata.vertex_modulates.size()); BatchVertexModulated *mv = (BatchVertexModulated *)cv; mv->modulate = *source_vertex_modulates++; } // including modulate if (INCLUDE_LARGE) { RAST_DEV_DEBUG_ASSERT(bdata.vertex_transforms.size()); BatchVertexLarge *lv = (BatchVertexLarge *)cv; lv->transform = *source_vertex_transforms++; } // if including large } } } } // copy the temporary batches to the master batch list (this could be avoided but it makes the code cleaner) bdata.batches.copy_from(bdata.batches_temp); } PREAMBLE(bool)::_disallow_item_join_if_batch_types_too_different(RenderItemState &r_ris, uint32_t btf_allowed) { r_ris.joined_item_batch_type_flags_curr |= btf_allowed; bool disallow = false; if (r_ris.joined_item_batch_type_flags_prev & (~btf_allowed)) { disallow = true; } return disallow; } PREAMBLE(bool)::_detect_item_batch_break(RenderItemState &r_ris, RasterizerCanvas::Item *p_ci, bool &r_batch_break) { int command_count = p_ci->commands.size(); // Any item that contains commands that are default // (i.e. not handled by software transform and the batching renderer) should not be joined. // ALSO batched types that differ in what the vertex format is needed to be should not be // joined. // In order to work this out, it does a lookahead through the commands, // which could potentially be very expensive. As such it makes sense to put a limit on this // to some small number, which will catch nearly all cases which need joining, // but not be overly expensive in the case of items with large numbers of commands. // It is hard to know what this number should be, empirically, // and this has not been fully investigated. 
It works to join single sprite items when set to 1 or above. // Note that there is a cost to increasing this because it has to look in advance through // the commands. // On the other hand joining items where possible will usually be better up to a certain // number where the cost of software transform is higher than separate drawcalls with hardware // transform. // if there are more than this number of commands in the item, we // don't allow joining (separate state changes, and hardware transform) // This is set to quite a conservative (low) number until investigated properly. // const int MAX_JOIN_ITEM_COMMANDS = 16; r_ris.joined_item_batch_type_flags_curr = 0; if (command_count > bdata.settings_max_join_item_commands) { return true; } else { RasterizerCanvas::Item::Command *const *commands = p_ci->commands.ptr(); // run through the commands looking for one that could prevent joining for (int command_num = 0; command_num < command_count; command_num++) { RasterizerCanvas::Item::Command *command = commands[command_num]; RAST_DEBUG_ASSERT(command); switch (command->type) { default: { //r_batch_break = true; return true; } break; case RasterizerCanvas::Item::Command::TYPE_LINE: { // special case, only batches certain lines RasterizerCanvas::Item::CommandLine *line = static_cast(command); if (line->width > 1) { //r_batch_break = true; return true; } if (_disallow_item_join_if_batch_types_too_different(r_ris, RasterizerStorageCommon::BTF_LINE | RasterizerStorageCommon::BTF_LINE_AA)) { return true; } } break; case RasterizerCanvas::Item::Command::TYPE_POLYGON: { // only allow polygons to join if they aren't skeleton RasterizerCanvas::Item::CommandPolygon *poly = static_cast(command); #ifdef GLES_OVER_GL // anti aliasing not accelerated if (poly->antialiased) { return true; } #endif // light angles not yet implemented, treat as default if (poly->normal_map != RID()) { return true; } if (_disallow_item_join_if_batch_types_too_different(r_ris, 
RasterizerStorageCommon::BTF_POLY)) { //r_batch_break = true; return true; } } break; case RasterizerCanvas::Item::Command::TYPE_RECT: { if (_disallow_item_join_if_batch_types_too_different(r_ris, RasterizerStorageCommon::BTF_RECT)) { return true; } } break; case RasterizerCanvas::Item::Command::TYPE_MULTIRECT: { if (_disallow_item_join_if_batch_types_too_different(r_ris, RasterizerStorageCommon::BTF_RECT)) { return true; } } break; case RasterizerCanvas::Item::Command::TYPE_NINEPATCH: { // do not handle tiled ninepatches, these can't be batched and need to use legacy method RasterizerCanvas::Item::CommandNinePatch *np = static_cast(command); if ((np->axis_x != RenderingServer::NINE_PATCH_STRETCH) || (np->axis_y != RenderingServer::NINE_PATCH_STRETCH)) { return true; } if (_disallow_item_join_if_batch_types_too_different(r_ris, RasterizerStorageCommon::BTF_RECT)) { return true; } } break; case RasterizerCanvas::Item::Command::TYPE_TRANSFORM: { // compatible with all types } break; } // switch } // for through commands } // else // special case, back buffer copy, so don't join if (p_ci->copy_back_buffer) { return true; } return false; } #undef PREAMBLE #undef T_PREAMBLE #undef C_PREAMBLE #endif // RASTERIZER_CANVAS_BATCHER_H