Common Source Code Project for Qt (a.k.a for FM-7).
修订版 | 825bf2ca6d1fdd3b866e72eaf7e6db32a4a704c4 (tree) |
---|---|
时间 | 2019-01-17 19:40:18 |
作者 | K.Ohta <whatisthis.sowhat@gmai...> |
Committer | K.Ohta |
[COMMON] Fix unaligned SIMD variables. Fix crash when built with "-msse2" on Win32.
[BUILD][Win32] Adjust optimization parameters for MinGW/Win32.
@@ -29,14 +29,14 @@ case ${CSP_DEBUG} in | ||
29 | 29 | MAKEFLAGS_BASE2="-ggdb ${ARCH_FLAGS} ${MAKEFLAGS_BASE} ${ADDITIONAL_CFLAGS} -DNDEBUG" |
30 | 30 | ;; |
31 | 31 | "No" | "no" | "NO" | * ) |
32 | - MAKEFLAGS_BASE2="${MAKEFLAGS_BASE} -O3 \ | |
32 | + MAKEFLAGS_BASE2="${MAKEFLAGS_BASE} -O2 \ | |
33 | 33 | ${ARCH_FLAGS} \ |
34 | - -ftree-vectorize \ | |
35 | - -ftree-loop-optimize \ | |
36 | - -floop-nest-optimize \ | |
37 | 34 | -std=c++11 \ |
38 | 35 | ${ADDITIONAL_CFLAGS} \ |
39 | 36 | -DNDEBUG " |
37 | +# -ftree-vectorize \ | |
38 | +# -ftree-loop-optimize \ | |
39 | +# -floop-nest-optimize \ | |
40 | 40 | ;; |
41 | 41 | esac |
42 | 42 |
@@ -537,10 +537,8 @@ uint8_t DLL_PREFIX A_OF_COLOR(scrntype_t c) | ||
537 | 537 | void DLL_PREFIX PrepareBitTransTableUint16(_bit_trans_table_t *tbl, uint16_t on_val, uint16_t off_val) |
538 | 538 | { |
539 | 539 | if(tbl == NULL) return; |
540 | -__DECL_VECTORIZED_LOOP | |
541 | 540 | for(uint16_t i = 0; i < 256; i++) { |
542 | 541 | uint16_t n = i; |
543 | -__DECL_VECTORIZED_LOOP | |
544 | 542 | for(int j = 0; j < 8; j++) { |
545 | 543 | tbl->plane_table[i].w[j] = ((n & 0x80) == 0) ? off_val : on_val; |
546 | 544 | n <<= 1; |
@@ -554,10 +552,8 @@ __DECL_VECTORIZED_LOOP | ||
554 | 552 | void DLL_PREFIX PrepareBitTransTableScrnType(_bit_trans_table_scrn_t *tbl, scrntype_t on_val, scrntype_t off_val) |
555 | 553 | { |
556 | 554 | if(tbl == NULL) return; |
557 | -__DECL_VECTORIZED_LOOP | |
558 | 555 | for(uint16_t i = 0; i < 256; i++) { |
559 | 556 | uint16_t n = i; |
560 | -__DECL_VECTORIZED_LOOP | |
561 | 557 | for(int j = 0; j < 8; j++) { |
562 | 558 | tbl->plane_table[i].w[j] = ((n & 0x80) == 0) ? off_val : on_val; |
563 | 559 | n <<= 1; |
@@ -569,10 +565,8 @@ __DECL_VECTORIZED_LOOP | ||
569 | 565 | void DLL_PREFIX PrepareReverseBitTransTableUint16(_bit_trans_table_t *tbl, uint16_t on_val, uint16_t off_val) |
570 | 566 | { |
571 | 567 | if(tbl == NULL) return; |
572 | -__DECL_VECTORIZED_LOOP | |
573 | 568 | for(uint16_t i = 0; i < 256; i++) { |
574 | 569 | uint16_t n = i; |
575 | -__DECL_VECTORIZED_LOOP | |
576 | 570 | for(int j = 0; j < 8; j++) { |
577 | 571 | tbl->plane_table[i].w[j] = ((n & 0x01) == 0) ? off_val : on_val; |
578 | 572 | n >>= 1; |
@@ -583,10 +577,8 @@ __DECL_VECTORIZED_LOOP | ||
583 | 577 | void DLL_PREFIX PrepareReverseBitTransTableScrnType(_bit_trans_table_scrn_t *tbl, scrntype_t on_val, scrntype_t off_val) |
584 | 578 | { |
585 | 579 | if(tbl == NULL) return; |
586 | -__DECL_VECTORIZED_LOOP | |
587 | 580 | for(uint16_t i = 0; i < 256; i++) { |
588 | 581 | uint16_t n = i; |
589 | -__DECL_VECTORIZED_LOOP | |
590 | 582 | for(int j = 0; j < 8; j++) { |
591 | 583 | tbl->plane_table[i].w[j] = ((n & 0x01) == 0) ? off_val : on_val; |
592 | 584 | n >>= 1; |
@@ -598,9 +590,9 @@ __DECL_VECTORIZED_LOOP | ||
598 | 590 | void DLL_PREFIX ConvertByteToPackedPixelByColorTable2(uint8_t *src, scrntype_t* dst, int bytes, _bit_trans_table_scrn_t *tbl, scrntype_t *on_color_table, scrntype_t* off_color_table) |
599 | 591 | { |
600 | 592 | |
601 | - scrntype_vec8_t tmpd; | |
602 | - scrntype_vec8_t tmpdd; | |
603 | - scrntype_vec8_t colors; | |
593 | + __DECL_ALIGNED(32) scrntype_vec8_t tmpd; | |
594 | + __DECL_ALIGNED(32) scrntype_vec8_t tmpdd; | |
595 | + __DECL_ALIGNED(32) scrntype_vec8_t colors; | |
604 | 596 | scrntype_vec8_t* vt = (scrntype_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(scrntype_vec8_t)); |
605 | 597 | |
606 | 598 | uintptr_t disalign = (uintptr_t)dst; |
@@ -664,10 +656,10 @@ __DECL_VECTORIZED_LOOP | ||
664 | 656 | void DLL_PREFIX ConvertByteToSparceUint16(uint8_t *src, uint16_t* dst, int bytes, _bit_trans_table_t *tbl, uint16_t mask) |
665 | 657 | { |
666 | 658 | |
667 | - uint16_vec8_t tmpd; | |
659 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
668 | 660 | uint16_vec8_t* vt = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t)); |
669 | 661 | |
670 | - uint16_vec8_t __masks; | |
662 | + __DECL_ALIGNED(16) uint16_vec8_t __masks; | |
671 | 663 | |
672 | 664 | __DECL_VECTORIZED_LOOP |
673 | 665 | for(int i = 0; i < 8; i++) { |
@@ -706,11 +698,11 @@ __DECL_VECTORIZED_LOOP | ||
706 | 698 | void DLL_PREFIX ConvertByteToSparceUint8(uint8_t *src, uint16_t* dst, int bytes, _bit_trans_table_t *tbl, uint16_t mask) |
707 | 699 | { |
708 | 700 | |
709 | - uint16_vec8_t tmpd; | |
701 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
710 | 702 | uint16_vec8_t* vt = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t)); |
711 | 703 | |
712 | - uint16_vec8_t __masks; | |
713 | - uint8_vec8_t tmpdd; | |
704 | + __DECL_ALIGNED(16) uint16_vec8_t __masks; | |
705 | + __DECL_ALIGNED(16) uint8_vec8_t tmpdd; | |
714 | 706 | |
715 | 707 | __DECL_VECTORIZED_LOOP |
716 | 708 | for(int i = 0; i < 8; i++) { |
@@ -751,8 +743,8 @@ __DECL_VECTORIZED_LOOP | ||
751 | 743 | void DLL_PREFIX ConvertByteToPackedPixelByColorTable(uint8_t *src, scrntype_t* dst, int bytes, _bit_trans_table_t *tbl, scrntype_t *on_color_table, scrntype_t* off_color_table) |
752 | 744 | { |
753 | 745 | |
754 | - uint16_vec8_t tmpd; | |
755 | - scrntype_vec8_t tmpdd; | |
746 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
747 | + __DECL_ALIGNED(32) scrntype_vec8_t tmpdd; | |
756 | 748 | uint16_vec8_t* vt = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t)); |
757 | 749 | |
758 | 750 | uintptr_t disalign = (uintptr_t)dst; |
@@ -831,8 +823,8 @@ __DECL_VECTORIZED_LOOP | ||
831 | 823 | uint8_t r, g, b; |
832 | 824 | int shift = src->shift; |
833 | 825 | const bool is_render[3] = { src->is_render[0], src->is_render[1], src->is_render[2] }; |
834 | - uint16_vec8_t tmpd; | |
835 | - scrntype_vec8_t tmp_dd; | |
826 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
827 | + __DECL_ALIGNED(32) scrntype_vec8_t tmp_dd; | |
836 | 828 | scrntype_vec8_t* vdp = (scrntype_vec8_t*)__builtin_assume_aligned(dst, sizeof(scrntype_vec8_t)); |
837 | 829 | |
838 | 830 | x = src->begin_pos; |
@@ -860,7 +852,7 @@ __DECL_VECTORIZED_LOOP | ||
860 | 852 | #else // 24bit |
861 | 853 | static const int shift_factor = 3; |
862 | 854 | #endif |
863 | - scrntype_vec8_t sline; | |
855 | + __DECL_ALIGNED(32) scrntype_vec8_t sline; | |
864 | 856 | scrntype_vec8_t* vdp2 = (scrntype_vec8_t*)__builtin_assume_aligned(dst2, sizeof(scrntype_vec8_t)); |
865 | 857 | __DECL_VECTORIZED_LOOP |
866 | 858 | for(int i = 0; i < 8; i++) { |
@@ -935,8 +927,8 @@ __DECL_VECTORIZED_LOOP | ||
935 | 927 | uint8_t r, g, b, n; |
936 | 928 | int shift = src->shift; |
937 | 929 | const bool is_render[4] = { src->is_render[0], src->is_render[1], src->is_render[2], src->is_render[3] }; |
938 | - uint16_vec8_t tmpd; | |
939 | - scrntype_vec8_t tmp_dd; | |
930 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
931 | + __DECL_ALIGNED(32) scrntype_vec8_t tmp_dd; | |
940 | 932 | scrntype_vec8_t* vdp = (scrntype_vec8_t*)__builtin_assume_aligned(dst, sizeof(scrntype_vec8_t)); |
941 | 933 | |
942 | 934 | x = src->begin_pos; |
@@ -966,7 +958,7 @@ __DECL_VECTORIZED_LOOP | ||
966 | 958 | #else // 24bit |
967 | 959 | static const int shift_factor = 3; |
968 | 960 | #endif |
969 | - scrntype_vec8_t sline; | |
961 | + __DECL_ALIGNED(32) scrntype_vec8_t sline; | |
970 | 962 | scrntype_vec8_t* vdp2 = (scrntype_vec8_t*)__builtin_assume_aligned(dst2, sizeof(scrntype_vec8_t)); |
971 | 963 | __DECL_VECTORIZED_LOOP |
972 | 964 | for(int i = 0; i < 8; i++) { |
@@ -1035,8 +1027,8 @@ __DECL_VECTORIZED_LOOP | ||
1035 | 1027 | uint8_t d[16]; |
1036 | 1028 | int shift = src->shift; |
1037 | 1029 | const bool is_render[4] = { src->is_render[0], src->is_render[1], src->is_render[2], src->is_render[3] }; |
1038 | - uint16_vec8_t tmpd; | |
1039 | - scrntype_vec8_t tmp_dd; | |
1030 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1031 | + __DECL_ALIGNED(32) scrntype_vec8_t tmp_dd; | |
1040 | 1032 | scrntype_vec8_t* vdp = (scrntype_vec8_t*)__builtin_assume_aligned(dst, sizeof(scrntype_vec8_t)); |
1041 | 1033 | |
1042 | 1034 | x = src->begin_pos; |
@@ -1065,7 +1057,7 @@ __DECL_VECTORIZED_LOOP | ||
1065 | 1057 | #else // 24bit |
1066 | 1058 | static const int shift_factor = 3; |
1067 | 1059 | #endif |
1068 | - scrntype_vec8_t sline; | |
1060 | + __DECL_ALIGNED(32) scrntype_vec8_t sline; | |
1069 | 1061 | scrntype_vec8_t* vdp2 = (scrntype_vec8_t*)__builtin_assume_aligned(dst2, sizeof(scrntype_vec8_t)); |
1070 | 1062 | __DECL_VECTORIZED_LOOP |
1071 | 1063 | for(int i = 0; i < 8; i++) { |
@@ -1104,7 +1096,7 @@ void DLL_PREFIX Convert2NColorsToByte_Line(_render_command_data_t *src, uint8_t | ||
1104 | 1096 | |
1105 | 1097 | uint8_t* srcp[8]; |
1106 | 1098 | __DECL_ALIGNED(32) uint32_t offset[8] = {0}; |
1107 | - uint16_vec8_t dat; | |
1099 | + __DECL_ALIGNED(16) uint16_vec8_t dat; | |
1108 | 1100 | uint16_vec8_t* bp[8] ; |
1109 | 1101 | |
1110 | 1102 | __DECL_VECTORIZED_LOOP |
@@ -1152,7 +1144,7 @@ void DLL_PREFIX Convert2NColorsToByte_LineZoom2(_render_command_data_t *src, uin | ||
1152 | 1144 | |
1153 | 1145 | uint8_t* srcp[8]; |
1154 | 1146 | __DECL_ALIGNED(32) uint32_t offset[8] = {0}; |
1155 | - uint16_vec8_t dat; | |
1147 | + __DECL_ALIGNED(16) uint16_vec8_t dat; | |
1156 | 1148 | uint16_vec8_t* bp[8] ; |
1157 | 1149 | |
1158 | 1150 | __DECL_VECTORIZED_LOOP |
@@ -1200,10 +1192,10 @@ void DLL_PREFIX Convert8ColorsToByte_Line(_render_command_data_t *src, uint8_t * | ||
1200 | 1192 | uint8_t *gp = &(src->data[2][src->baseaddress[2]]); |
1201 | 1193 | __DECL_ALIGNED(16) uint32_t offset[4] = {0}; |
1202 | 1194 | |
1203 | - uint16_vec8_t rdat; | |
1204 | - uint16_vec8_t gdat; | |
1205 | - uint16_vec8_t bdat; | |
1206 | - uint16_vec8_t tmpd; | |
1195 | + __DECL_ALIGNED(16) uint16_vec8_t rdat; | |
1196 | + __DECL_ALIGNED(16) uint16_vec8_t gdat; | |
1197 | + __DECL_ALIGNED(16) uint16_vec8_t bdat; | |
1198 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1207 | 1199 | |
1208 | 1200 | uint16_vec8_t* bpb = (uint16_vec8_t*)__builtin_assume_aligned(&(src->bit_trans_table[0]->plane_table[0]), sizeof(uint16_vec8_t)); |
1209 | 1201 | uint16_vec8_t* bpr = (uint16_vec8_t*)__builtin_assume_aligned(&(src->bit_trans_table[1]->plane_table[0]), sizeof(uint16_vec8_t)); |
@@ -1082,8 +1082,8 @@ typedef struct { | ||
1082 | 1082 | |
1083 | 1083 | inline scrntype_vec8_t ConvertByteToMonochromePackedPixel(uint8_t src, _bit_trans_table_t *tbl,scrntype_t on_val, scrntype_t off_val) |
1084 | 1084 | { |
1085 | - uint16_vec8_t tmpd; | |
1086 | - scrntype_vec8_t tmpdd; | |
1085 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1086 | + __DECL_ALIGNED(32) scrntype_vec8_t tmpdd; | |
1087 | 1087 | _bit_trans_table_t* vt = (_bit_trans_table_t*)__builtin_assume_aligned(tbl, sizeof(uint16_vec8_t)); |
1088 | 1088 | |
1089 | 1089 | tmpd.v = vt->plane_table[src].v; |
@@ -1104,7 +1104,7 @@ void DLL_PREFIX ConvertByteToSparceUint8(uint8_t *src, uint16_t* dst, int bytes, | ||
1104 | 1104 | // Table must be (ON_VAL_COLOR : OFF_VAL_COLOR)[256]. |
1105 | 1105 | inline scrntype_vec8_t ConvertByteToPackedPixel_PixelTbl(uint8_t src, _bit_trans_table_scrn_t *tbl) |
1106 | 1106 | { |
1107 | - scrntype_vec8_t tmpdd; | |
1107 | + __DECL_ALIGNED(32) scrntype_vec8_t tmpdd; | |
1108 | 1108 | _bit_trans_table_scrn_t* vt = (_bit_trans_table_scrn_t*)__builtin_assume_aligned(tbl, sizeof(uint16_vec8_t)); |
1109 | 1109 | |
1110 | 1110 | tmpdd.v = vt->plane_table[src].v; |
@@ -1114,8 +1114,8 @@ inline scrntype_vec8_t ConvertByteToPackedPixel_PixelTbl(uint8_t src, _bit_trans | ||
1114 | 1114 | // Table must be (ON_VAL_COLOR : OFF_VAL_COLOR)[256]. |
1115 | 1115 | inline scrntype_vec16_t ConvertByteToDoublePackedPixel_PixelTbl(uint8_t src, _bit_trans_table_scrn_t *tbl) |
1116 | 1116 | { |
1117 | - scrntype_vec16_t tmpdd; | |
1118 | - scrntype_vec8_t tmpd; | |
1117 | + __DECL_ALIGNED(32) scrntype_vec16_t tmpdd; | |
1118 | + __DECL_ALIGNED(32) scrntype_vec8_t tmpd; | |
1119 | 1119 | _bit_trans_table_scrn_t* vt = (_bit_trans_table_scrn_t*)__builtin_assume_aligned(tbl, sizeof(uint16_vec8_t)); |
1120 | 1120 | tmpd.v = vt->plane_table[src].v; |
1121 | 1121 | int j = 0; |
@@ -1131,7 +1131,7 @@ __DECL_VECTORIZED_LOOP | ||
1131 | 1131 | // Table must be initialize ON_COLOR : OFF_COLOR |
1132 | 1132 | inline void ConvertByteToDoubleMonochromeUint8(uint8_t src, uint8_t* dst, _bit_trans_table_t* tbl) |
1133 | 1133 | { |
1134 | - uint16_vec8_t tmpd; | |
1134 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1135 | 1135 | uint16_vec8_t* vt = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1136 | 1136 | |
1137 | 1137 | __DECL_ALIGNED(16) uint8_t d[16]; |
@@ -1151,7 +1151,7 @@ __DECL_VECTORIZED_LOOP | ||
1151 | 1151 | |
1152 | 1152 | inline void ConvertByteToMonochromeUint8(uint8_t src, uint8_t* dst, _bit_trans_table_t* tbl) |
1153 | 1153 | { |
1154 | - uint16_vec8_t tmpd; | |
1154 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1155 | 1155 | uint16_vec8_t* vt = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1156 | 1156 | |
1157 | 1157 | tmpd = vt[src]; |
@@ -1163,7 +1163,7 @@ __DECL_VECTORIZED_LOOP | ||
1163 | 1163 | |
1164 | 1164 | inline void ConvertRGBTo8ColorsUint8(uint8_t r, uint8_t g, uint8_t b, uint8_t* dst, _bit_trans_table_t* rtbl, _bit_trans_table_t* gtbl, _bit_trans_table_t* btbl, int shift) |
1165 | 1165 | { |
1166 | - uint16_vec8_t tmpd; | |
1166 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1167 | 1167 | uint16_vec8_t* rvt = (uint16_vec8_t*)__builtin_assume_aligned(&(rtbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1168 | 1168 | uint16_vec8_t* gvt = (uint16_vec8_t*)__builtin_assume_aligned(&(gtbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1169 | 1169 | uint16_vec8_t* bvt = (uint16_vec8_t*)__builtin_assume_aligned(&(btbl->plane_table[0]), sizeof(uint16_vec8_t)); |
@@ -1180,7 +1180,7 @@ __DECL_VECTORIZED_LOOP | ||
1180 | 1180 | |
1181 | 1181 | inline void ConvertRGBTo8ColorsUint8_Zoom2Left(uint8_t r, uint8_t g, uint8_t b, uint8_t* dst, _bit_trans_table_t* rtbl, _bit_trans_table_t* gtbl, _bit_trans_table_t* btbl, int shift) |
1182 | 1182 | { |
1183 | - uint16_vec8_t tmpd; | |
1183 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1184 | 1184 | uint16_vec8_t* rvt = (uint16_vec8_t*)__builtin_assume_aligned(&(rtbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1185 | 1185 | uint16_vec8_t* gvt = (uint16_vec8_t*)__builtin_assume_aligned(&(gtbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1186 | 1186 | uint16_vec8_t* bvt = (uint16_vec8_t*)__builtin_assume_aligned(&(btbl->plane_table[0]), sizeof(uint16_vec8_t)); |
@@ -1198,7 +1198,7 @@ __DECL_VECTORIZED_LOOP | ||
1198 | 1198 | |
1199 | 1199 | inline void ConvertRGBTo8ColorsUint8_Zoom2Right(uint8_t r, uint8_t g, uint8_t b, uint8_t* dst, _bit_trans_table_t* rtbl, _bit_trans_table_t* gtbl, _bit_trans_table_t* btbl, int shift) |
1200 | 1200 | { |
1201 | - uint16_vec8_t tmpd; | |
1201 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1202 | 1202 | uint16_vec8_t* rvt = (uint16_vec8_t*)__builtin_assume_aligned(&(rtbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1203 | 1203 | uint16_vec8_t* gvt = (uint16_vec8_t*)__builtin_assume_aligned(&(gtbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1204 | 1204 | uint16_vec8_t* bvt = (uint16_vec8_t*)__builtin_assume_aligned(&(btbl->plane_table[0]), sizeof(uint16_vec8_t)); |
@@ -1216,7 +1216,7 @@ __DECL_VECTORIZED_LOOP | ||
1216 | 1216 | |
1217 | 1217 | inline void ConvertRGBTo8ColorsUint8_Zoom2Double(uint8_t r, uint8_t g, uint8_t b, uint8_t* dst, _bit_trans_table_t* rtbl, _bit_trans_table_t* gtbl, _bit_trans_table_t* btbl, int shift) |
1218 | 1218 | { |
1219 | - uint16_vec8_t tmpd; | |
1219 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1220 | 1220 | uint16_vec8_t* rvt = (uint16_vec8_t*)__builtin_assume_aligned(&(rtbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1221 | 1221 | uint16_vec8_t* gvt = (uint16_vec8_t*)__builtin_assume_aligned(&(gtbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1222 | 1222 | uint16_vec8_t* bvt = (uint16_vec8_t*)__builtin_assume_aligned(&(btbl->plane_table[0]), sizeof(uint16_vec8_t)); |
@@ -1234,7 +1234,7 @@ __DECL_VECTORIZED_LOOP | ||
1234 | 1234 | |
1235 | 1235 | inline void ConvertByteToMonochromeUint8Cond_Zoom2(uint8_t src, uint8_t* dst, _bit_trans_table_t* tbl, uint8_t on_color, uint8_t off_color) |
1236 | 1236 | { |
1237 | - uint16_vec8_t tmpd; | |
1237 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1238 | 1238 | uint16_vec8_t* vt = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1239 | 1239 | |
1240 | 1240 | __DECL_ALIGNED(16) uint8_t d[16]; |
@@ -1254,7 +1254,7 @@ __DECL_VECTORIZED_LOOP | ||
1254 | 1254 | |
1255 | 1255 | inline void ConvertByteToMonochromeUint8Cond(uint8_t src, uint8_t* dst, _bit_trans_table_t* tbl, uint8_t on_color, uint8_t off_color) |
1256 | 1256 | { |
1257 | - uint16_vec8_t tmpd; | |
1257 | + __DECL_ALIGNED(16) uint16_vec8_t tmpd; | |
1258 | 1258 | uint16_vec8_t* vt = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t)); |
1259 | 1259 | |
1260 | 1260 | tmpd = vt[src]; |
@@ -42,31 +42,6 @@ DISPLAY::DISPLAY(VM_TEMPLATE* parent_vm, EMU* parent_emu) : DEVICE(parent_vm, pa | ||
42 | 42 | mainio = NULL; |
43 | 43 | subcpu = NULL; |
44 | 44 | keyboard = NULL; |
45 | -#if 1 | |
46 | - PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_0[0][0])), 0x0080, 0x0000); | |
47 | - PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_1[0][0])), 0x0040, 0x0000); | |
48 | - PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_2[0][0])), 0x0020, 0x0000); | |
49 | - PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_3[0][0])), 0x0010, 0x0000); | |
50 | -#if defined(_FM77AV40) || defined(_FM77AV40EX) || defined(_FM77AV40SX) | |
51 | - PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_4[0][0])), 0x0008, 0x0000); | |
52 | - PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_5[0][0])), 0x0004, 0x0000); | |
53 | -#endif | |
54 | -#else | |
55 | - for(int i = 0; i < 256; i++) { | |
56 | - uint16_t n = (uint16_t)i; | |
57 | - for(int j = 0; j < 8; j++) { | |
58 | - bit_trans_table_0[i][j] = n & 0x80; | |
59 | - bit_trans_table_1[i][j] = ((n & 0x80) != 0) ? 0x40 : 0; | |
60 | - bit_trans_table_2[i][j] = ((n & 0x80) != 0) ? 0x20 : 0; | |
61 | - bit_trans_table_3[i][j] = ((n & 0x80) != 0) ? 0x10 : 0; | |
62 | -#if defined(_FM77AV40) || defined(_FM77AV40EX) || defined(_FM77AV40SX) | |
63 | - bit_trans_table_4[i][j] = ((n & 0x80) != 0) ? 0x08 : 0; | |
64 | - bit_trans_table_5[i][j] = ((n & 0x80) != 0) ? 0x04 : 0; | |
65 | -#endif | |
66 | - n <<= 1; | |
67 | - } | |
68 | - } | |
69 | -#endif | |
70 | 45 | displine = 0; |
71 | 46 | active_page = 0; |
72 | 47 | #if defined(USE_GREEN_DISPLAY) |
@@ -3365,6 +3340,31 @@ void DISPLAY::initialize() | ||
3365 | 3340 | { |
3366 | 3341 | int i; |
3367 | 3342 | |
3343 | +#if 1 | |
3344 | + PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_0[0][0])), 0x0080, 0x0000); | |
3345 | + PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_1[0][0])), 0x0040, 0x0000); | |
3346 | + PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_2[0][0])), 0x0020, 0x0000); | |
3347 | + PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_3[0][0])), 0x0010, 0x0000); | |
3348 | +#if defined(_FM77AV40) || defined(_FM77AV40EX) || defined(_FM77AV40SX) | |
3349 | + PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_4[0][0])), 0x0008, 0x0000); | |
3350 | + PrepareBitTransTableUint16((_bit_trans_table_t*)(&(bit_trans_table_5[0][0])), 0x0004, 0x0000); | |
3351 | +#endif | |
3352 | +#else | |
3353 | + for(int i = 0; i < 256; i++) { | |
3354 | + uint16_t n = (uint16_t)i; | |
3355 | + for(int j = 0; j < 8; j++) { | |
3356 | + bit_trans_table_0[i][j] = n & 0x80; | |
3357 | + bit_trans_table_1[i][j] = ((n & 0x80) != 0) ? 0x40 : 0; | |
3358 | + bit_trans_table_2[i][j] = ((n & 0x80) != 0) ? 0x20 : 0; | |
3359 | + bit_trans_table_3[i][j] = ((n & 0x80) != 0) ? 0x10 : 0; | |
3360 | +#if defined(_FM77AV40) || defined(_FM77AV40EX) || defined(_FM77AV40SX) | |
3361 | + bit_trans_table_4[i][j] = ((n & 0x80) != 0) ? 0x08 : 0; | |
3362 | + bit_trans_table_5[i][j] = ((n & 0x80) != 0) ? 0x04 : 0; | |
3363 | +#endif | |
3364 | + n <<= 1; | |
3365 | + } | |
3366 | + } | |
3367 | +#endif | |
3368 | 3368 | memset(io_w_latch, 0xff, sizeof(io_w_latch)); |
3369 | 3369 | screen_update_flag = true; |
3370 | 3370 | memset(gvram, 0x00, sizeof(gvram)); |
@@ -671,8 +671,8 @@ void DISPLAY::CopyDrawnData(scrntype_t* src, scrntype_t* dst, int width, bool sc | ||
671 | 671 | #endif |
672 | 672 | scrntype_vec8_t* vsrc = (scrntype_vec8_t*)__builtin_assume_aligned(src, sizeof(scrntype_vec8_t)); |
673 | 673 | scrntype_vec8_t* vdst = (scrntype_vec8_t*)__builtin_assume_aligned(dst, sizeof(scrntype_vec8_t)); |
674 | - scrntype_vec8_t tmp_dd; | |
675 | - scrntype_vec8_t sline; | |
674 | + __DECL_ALIGNED(32) scrntype_vec8_t tmp_dd; | |
675 | + __DECL_ALIGNED(32) scrntype_vec8_t sline; | |
676 | 676 | |
677 | 677 | if(scan_line) { |
678 | 678 | __DECL_VECTORIZED_LOOP |
@@ -747,7 +747,7 @@ void DISPLAY::GETVRAM_1_400L(int yoff, scrntype_t *p) | ||
747 | 747 | pixel = gvram_shadow[yoff_d]; |
748 | 748 | uint16_vec8_t *ppx = (uint16_vec8_t *)__builtin_assume_aligned(&(bit_trans_table_0[pixel][0]), 16); |
749 | 749 | __DECL_ALIGNED(16) uint16_vec8_t tmp_d; |
750 | - scrntype_vec8_t tmp_dd; | |
750 | + __DECL_ALIGNED(32) scrntype_vec8_t tmp_dd; | |
751 | 751 | scrntype_vec8_t *vp = (scrntype_vec8_t *)__builtin_assume_aligned(p, sizeof(scrntype_vec8_t)); |
752 | 752 | |
753 | 753 | tmp_d.v = ppx->v; |
@@ -770,7 +770,7 @@ void DISPLAY::GETVRAM_1_400L_GREEN(int yoff, scrntype_t *p) | ||
770 | 770 | pixel = gvram_shadow[yoff_d]; |
771 | 771 | uint16_vec8_t *ppx = (uint16_vec8_t *)__builtin_assume_aligned(&(bit_trans_table_0[pixel][0]), 16); |
772 | 772 | __DECL_ALIGNED(16) uint16_vec8_t tmp_d; |
773 | - scrntype_vec8_t tmp_dd; | |
773 | + __DECL_ALIGNED(32) scrntype_vec8_t tmp_dd; | |
774 | 774 | scrntype_vec8_t *vp = (scrntype_vec8_t *)__builtin_assume_aligned(p, sizeof(scrntype_vec8_t)); |
775 | 775 | |
776 | 776 | tmp_d.v = ppx->v; |
@@ -794,7 +794,7 @@ void DISPLAY::GETVRAM_4096(int yoff, scrntype_t *p, scrntype_t *px, | ||
794 | 794 | { |
795 | 795 | uint32_t b3, r3, g3; |
796 | 796 | uint8_t bb[4], rr[4], gg[4]; |
797 | - uint16_vec8_t pixels; | |
797 | + __DECL_ALIGNED(16) uint16_vec8_t pixels; | |
798 | 798 | __DECL_ALIGNED(16) const uint16_t __masks[8] = {(uint16_t)mask, (uint16_t)mask, (uint16_t)mask, (uint16_t)mask, (uint16_t)mask, (uint16_t)mask, (uint16_t)mask, (uint16_t)mask}; |
799 | 799 | scrntype_t b, r, g; |
800 | 800 | uint32_t idx;; |
@@ -841,7 +841,7 @@ void DISPLAY::GETVRAM_4096(int yoff, scrntype_t *p, scrntype_t *px, | ||
841 | 841 | #else |
842 | 842 | __DECL_ALIGNED(sizeof(scrntype_t) * 8) scrntype_t tmp_dd[16]; |
843 | 843 | #endif |
844 | - uint16_vec8_t tmp_g, tmp_r, tmp_b; | |
844 | + __DECL_ALIGNED(16) uint16_vec8_t tmp_g, tmp_r, tmp_b; | |
845 | 845 | __v8hi *vp0, *vp1, *vp2, *vp3; |
846 | 846 | // G |
847 | 847 | vp0 = (__v8hi*)__builtin_assume_aligned(&(bit_trans_table_0[gg[0]][0]), 16); |
@@ -895,7 +895,7 @@ __DECL_VECTORIZED_LOOP | ||
895 | 895 | tmp_dd[i * 2] = tmp_dd[i * 2 + 1] = analog_palette_pixel[pixels.w[i]];; |
896 | 896 | } |
897 | 897 | scrntype_vec8_t *vpx = (scrntype_vec8_t*)__builtin_assume_aligned(px, sizeof(scrntype_vec8_t)); |
898 | - scrntype_vec8_t vmask; | |
898 | + __DECL_ALIGNED(32) scrntype_vec8_t vmask; | |
899 | 899 | __DECL_VECTORIZED_LOOP |
900 | 900 | for(int i = 0; i < 2; i++) { |
901 | 901 | vp[i].v = dp[i].v; |
@@ -957,9 +957,9 @@ void DISPLAY::GETVRAM_256k(int yoff, scrntype_t *p, scrntype_t *px, bool scan_li | ||
957 | 957 | |
958 | 958 | uint8_t bb[8], rr[8], gg[8]; |
959 | 959 | |
960 | - uint16_vec8_t _btmp; | |
961 | - uint16_vec8_t _rtmp; | |
962 | - uint16_vec8_t _gtmp; | |
960 | + __DECL_ALIGNED(16) uint16_vec8_t _btmp; | |
961 | + __DECL_ALIGNED(16) uint16_vec8_t _rtmp; | |
962 | + __DECL_ALIGNED(16) uint16_vec8_t _gtmp; | |
963 | 963 | uint16_vec8_t *vp0, *vp1, *vp2, *vp3, *vp4, *vp5; |
964 | 964 | #if !defined(FIXED_FRAMEBUFFER_SIZE) |
965 | 965 | __DECL_ALIGNED(sizeof(scrntype_t) * 8) scrntype_t tmp_dd[8]; |
@@ -1088,7 +1088,7 @@ __DECL_VECTORIZED_LOOP | ||
1088 | 1088 | dp[i].v = dp[i].v >> 2; |
1089 | 1089 | #endif |
1090 | 1090 | } |
1091 | - scrntype_vec8_t scanline_data; | |
1091 | + __DECL_ALIGNED(32) scrntype_vec8_t scanline_data; | |
1092 | 1092 | __DECL_VECTORIZED_LOOP |
1093 | 1093 | for(int i = 0; i < 8; i++) { |
1094 | 1094 | scanline_data.w[i] = RGBA_COLOR(31, 31, 31, 255); |
@@ -99,7 +99,7 @@ void MEMORY::draw_screen() | ||
99 | 99 | dest[x] = (val & bit) ? col_w : col_b; |
100 | 100 | } |
101 | 101 | #else |
102 | - scrntype_vec8_t d; | |
102 | + __DECL_ALIGNED(32) scrntype_vec8_t d; | |
103 | 103 | for(int xx = 32; xx < (240 - 32); xx += 8) { |
104 | 104 | uint8_t val = ram[offset + (xx >> 3)]; |
105 | 105 | d = ConvertByteToPackedPixel_PixelTbl(val, &pixel_trans_table); |
@@ -31,7 +31,7 @@ private: | ||
31 | 31 | |
32 | 32 | bool inserted; |
33 | 33 | |
34 | - _bit_trans_table_scrn_t pixel_trans_table; | |
34 | + __DECL_ALIGNED(32) _bit_trans_table_scrn_t pixel_trans_table; | |
35 | 35 | public: |
36 | 36 | MEMORY(VM_TEMPLATE* parent_vm, EMU* parent_emu) : DEVICE(parent_vm, parent_emu) |
37 | 37 | { |