Hi Archaeologist,
The command line being used is: cl6x -qq --gcc -O3 -mi200 -mv6600 -mt -mw -oi -pdr <various include paths> <various defines> --abi=eabi -k -fr <temp dir> -fs <temp dir> -ft <temp dir> <filename>
Regarding your questions above:
- Yes, as you spotted, GetIndexInBitField returns the index of the least-significant set bit and clears that bit in its parameter (i.e. aEnabledBitField)
- As you spotted, no
- Again, as you spotted, yes (see 1.)
- No
- Yes, there is a circular buffer of arrays of structures (all in another file, as is the definition of this function). This function looks up the circular buffer entry using the first parameter, then looks up the array element using the second parameter, and returns a pointer to it. Note that all this state is private to another file.
- No
Regarding point 5, I've made this refer to a function that I've declared but not defined (so no state dependence and no side effects) and the assembler produced is the same.
OK, here is the assembler produced with -os. The call immediately before the while loop is a memcpy (to copy the bitfield that is iterated over, and destroyed by, the loop), so I've started from that call...
CALLP .S2 memcpy,B3
|| ADDAW .D1X SP,112,A4 ; |1566|
|| MVK .S1 0x80,A6 ; |1566|
$C$RL69: ; CALL OCCURS {memcpy} {0} ; |1566|
;** --------------------------------------------------------------------------*
; EXCLUSIVE CPU CYCLES: 2
MVK .S2 0x10,B5 ; |219|
SUB .L2 B5,2,B5
|| ADDAW .D2 SP,110,B4
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : fg_ctrl.c
;* Loop inlined from : ctrl_utils.h
;* Loop source line : 219
;* Loop opening brace source line : 220
;* Loop closing brace source line : 225
;* Loop Unroll Multiple : 2x
;* Known Minimum Trip Count : 16
;* Known Maximum Trip Count : 16
;* Known Max Trip Count Factor : 16
;* Loop Carried Dependency Bound(^) : 2
;* Unpartitioned Resource Bound : 1
;* Partitioned Resource Bound(*) : 2
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 0 0
;* .D units 1 1
;* .M units 0 0
;* .X cross paths 0 0
;* .T address paths 1 1
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 2 0 (.L or .S unit)
;* Addition ops (.LSD) 2 0 (.L or .S or .D unit)
;* Bound(.L .S .LS) 1 0
;* Bound(.L .S .D .LS .LSD) 2* 1
;*
;* Searching for software pipeline schedule at ...
;* ii = 2 Schedule found with 4 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: | *** |* ** |
;* 1: | *** |* ** |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* For further improvement on this loop, try option -mh8
;*
;* Minimum safe trip count : 1 (after unrolling)
;* Min. prof. trip count (est.) : 2 (after unrolling)
;*
;* Mem bank conflicts/iter(est.) : { min 0.000, est 0.000, max 0.000 }
;* Mem bank perf. penalty (est.) : 0.0%
;*
;*
;* Total cycles (est.) : 6 + min_trip_cnt * 2 = 38
;*----------------------------------------------------------------------------*
;* SETUP CODE
;*
;* MV B5,B4
;* ADD 12,B4,B4
;* ADD 8,B5,B5
;*
;* SINGLE SCHEDULED ITERATION
;*
;* $C$C1902:
;* 0 LDW .D2T2 *B5++(8),B0 ; |223|
;* 1 LDW .D2T2 *B4++(8),B0 ; |223|
;* 2 NOP 3
;* 5 [ B0] MIN2 .L1 A3,A4,A4 ; |223| ^
;* 6 [ B0] MIN2 .L1 A5,A4,A4 ; |223| ^
;* || ADD .D1 2,A5,A5 ; |219|
;* || ADD .S1 2,A3,A3 ; |219|
;* || SPBR $C$C1902
;* 7 NOP 1
;* 8 ; BRANCHCC OCCURS {$C$C1902} ; |219|
;*----------------------------------------------------------------------------*
$C$L71: ; PIPED LOOP PROLOG
; EXCLUSIVE CPU CYCLES: 7
.dwpsn file "ctrl_utils.h",line 219,column 0,is_stmt,isa 0
SPLOOPD 2 ;8 ; (P)
|| MVC .S2 B5,ILC
|| ADD .L2 8,B4,B5
;** --------------------------------------------------------------------------*
$C$L72: ; PIPED LOOP KERNEL
$C$DW$L$ulc_FgrCtrl$74$B:
.dwpsn file "ctrl_utils.h",line 220,column 0,is_stmt,isa 0
; EXCLUSIVE CPU CYCLES: 2
SPMASK L2
|| ADD .L2 12,B4,B4
|| LDW .D2T2 *B5++(8),B0 ; |223| (P) <0,0>
LDW .D2T2 *B4++(8),B0 ; |223| (P) <0,1>
NOP 2
SPMASK L1,S1
|| ZERO .L1 A3 ; |219|
|| MV .S1X B10,A4 ; |212|
SPMASK S1
|| MVK .S1 0x1,A5
|| [ B0] MIN2 .L1 A3,A4,A4 ; |223| (P) <0,5> ^
.dwpsn file "ctrl_utils.h",line 225,column 0,is_stmt,isa 0
SPKERNEL 0,0
|| ADD .S1 2,A3,A3 ; |219| <0,6>
|| ADD .D1 2,A5,A5 ; |219| <0,6>
|| [ B0] MIN2 .L1 A5,A4,A4 ; |223| <0,6> ^
$C$DW$L$ulc_FgrCtrl$74$E:
;** --------------------------------------------------------------------------*
$C$L73: ; PIPED LOOP EPILOG
; EXCLUSIVE CPU CYCLES: 5
;** 210 ----------------------- Index = K$217; // [109]
;** 219 ----------------------- K$226 = 32u; // [109]
;** ----------------------- goto g60;
MVK .S2 0x3e8,B4 ; |1518|
MVK .S1 32,A6 ; |227|
NOP 1
MVK .S1 0x20,A10 ; |219|
NOP 1
;** --------------------------------------------------------------------------*
; EXCLUSIVE CPU CYCLES: 7
MV .S1X SP,A3 ; |230| Register A/B partition copy
|| CMPLTU .L1 A4,A6,A0 ; |227|
[ A0] MVK .S1 448,A5 ; |230|
|| ADDAW .D1 A3,A4,A3 ; |230|
|| [!A0] B .S2 $C$L74
ADD .L1 A5,A3,A5 ; |230|
[ A0] LDW .D1T1 *A5,A3 ; |230|
NOP 3
; BRANCHCC OCCURS {$C$L74}
;** --------------------------------------------------------------------------*
; EXCLUSIVE CPU CYCLES: 9
;** -----------------------g59:
;** 230 ----------------------- C$205 = &aEnabledBitField[WordFound]; // [109]
;** 230 ----------------------- C$206 = *C$205; // [109]
;** 230 ----------------------- (C$206&1u) ? (C$207 = 0u) : (C$207 = _norm((int)_bitr(C$206&0xfffffffeu))+1u); // [109]
;** 230 ----------------------- Index = (WordFound<<5)+C$207; // [109]
;** 231 ----------------------- *C$205 = ~(1u<<C$207)&C$206; // [109]
;** 230 ----------------------- K$226 = 32u; // [109]
MVK .L2 1,B4 ; |231|
|| SHL .S2X A4,5,B31 ; |230|
AND .L1 -2,A3,A6 ; |230|
BITR .M1 A6,A6 ; |230|
AND .L1 1,A3,A0 ; |230|
NORM .L1 A6,A6 ; |230|
[!A0] ADD .L1 1,A6,A6 ; |230|
|| [ A0] ZERO .S1 A6 ; |230|
SHL .S1X B4,A6,A7 ; |231|
ANDN .L1 A3,A7,A3 ; |231|
|| ADD .L2X A6,B31,B4 ; |230|
STW .D1T1 A3,*A5 ; |231|
;** --------------------------------------------------------------------------*
$C$L74:
; EXCLUSIVE CPU CYCLES: 7
;** -----------------------g60:
;** 234 ----------------------- Index = Index; // [109]
;* 1568 ----------------------- if ( Index > K$269 ) goto g68;
;** ----------------------- #pragma LOOP_FLAGS(5120u)
CMPGTU .L2 B4,B12,B0 ; |1568|
[ B0] BNOP .S1 $C$L79,4 ; |1568|
MV .L1X B4,A11 ; |234|
; BRANCHCC OCCURS {$C$L79} ; |1568|
;** --------------------------------------------------------------------------*
;** BEGIN LOOP $C$L75
;** --------------------------------------------------------------------------*
$C$L75:
$C$DW$L$ulc_FgrCtrl$79$B:
; EXCLUSIVE CPU CYCLES: 6
;** -----------------------g62:
;* 1574 ----------------------- C$204 = Index*148;
;* 1574 ----------------------- *(C$204+(struct $fake35 **)K$180+192) = GetBuffAddr(Absolute, 0u);
;* 1578 ----------------------- (**((struct $fake36 *)K$180+C$204+192)).Index = 10u;
;** 212 ----------------------- WordFound = K$226; // [109]
;** 219 ----------------------- L$4 = 16; // [109]
;** ----------------------- U$381 = 1u;
;** ----------------------- U$378 = &aEnabledBitField[-2];
;** 219 ----------------------- WordIndex = 0u; // [109]
;** ----------------------- #pragma MUST_ITERATE(16, 16, 16)
;** ----------------------- #pragma UNROLL(1)
;** ----------------------- // LOOP BELOW UNROLLED BY FACTOR(2)
;** ----------------------- #pragma LOOP_FLAGS(4099u)
;** -----------------------g63:
;** 223 ----------------------- (*(U$378 += 2)) ? (WordFound = _min2((int)WordIndex, (int)WordFound)) : WordFound; // [109]
;** 223 ----------------------- (U$378[1]) ? (WordFound = _min2((int)U$381, (int)WordFound)) : WordFound; // [109]
;** 219 ----------------------- U$381 += 2u; // [109]
;** 219 ----------------------- WordIndex += 2u; // [109]
;** 219 ----------------------- if ( !__builtin_expect((long)!(L$4 = L$4-1), 0L) ) goto g63; // [109]
;** 227 ----------------------- if ( WordFound < 32u ) goto g66; // [109]
$C$DW$657 .dwtag DW_TAG_TI_branch
.dwattr $C$DW$657, DW_AT_low_pc(0x00)
.dwattr $C$DW$657, DW_AT_name("GetBuffAddr")
.dwattr $C$DW$657, DW_AT_TI_call
CALL .S1 GetBuffAddr ; |1574|
ADDKPC .S2 $C$RL70,B3,3 ; |1574|
ZERO .L2 B4 ; |1574|
|| MV .L1X B11,A4 ; |1574|
$C$RL70: ; CALL OCCURS {GetBuffAddr} {0} ; |1574|
$C$DW$L$ulc_FgrCtrl$79$E:
;** --------------------------------------------------------------------------*
$C$DW$L$ulc_FgrCtrl$80$B:
; EXCLUSIVE CPU CYCLES: 14
MVK .S1 148,A3 ; |1574|
MPY32 .M1 A3,A11,A5 ; |1574|
MVKL .S2 s_ControlParams,B4
MVKH .S2 s_ControlParams,B4
MVKL .S1 s_ControlParams,A31
MVKH .S1 s_ControlParams,A31
ADD .L2X A5,B4,B4 ; |1578|
ADDK .S2 192,B4 ; |1578|
LDW .D2T2 *B4,B5 ; |1578|
ADD .L1 A31,A5,A3 ; |1574|
MVK .L2 10,B7 ; |1578|
MVK .S2 0x10,B30 ; |219|
ADDAW .D2 SP,110,B31
STW .D2T2 B7,*B5 ; |1578|
|| SUB .L2 B30,2,B7
|| ADDK .S1 192,A3 ; |1574|
|| MVC .S2 B30,RILC
$C$DW$L$ulc_FgrCtrl$80$E:
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : fg_ctrl.c
;* Loop inlined from : ctrl_utils.h
;* Loop source line : 219
;* Loop opening brace source line : 220
;* Loop closing brace source line : 225
;* Loop Unroll Multiple : 2x
;* Known Minimum Trip Count : 16
;* Known Maximum Trip Count : 16
;* Known Max Trip Count Factor : 16
;* Loop Carried Dependency Bound(^) : 2
;* Unpartitioned Resource Bound : 1
;* Partitioned Resource Bound(*) : 2
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 0 0
;* .D units 1 1
;* .M units 0 0
;* .X cross paths 0 0
;* .T address paths 1 1
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 2 0 (.L or .S unit)
;* Addition ops (.LSD) 2 0 (.L or .S or .D unit)
;* Bound(.L .S .LS) 1 0
;* Bound(.L .S .D .LS .LSD) 2* 1
;*
;* Searching for software pipeline schedule at ...
;* ii = 2 Schedule found with 4 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: | *** |* ** |
;* 1: | *** |* ** |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* For further improvement on this loop, try option -mh8
;*
;* Minimum safe trip count : 1 (after unrolling)
;* Min. prof. trip count (est.) : 2 (after unrolling)
;*
;* Mem bank conflicts/iter(est.) : { min 0.000, est 0.000, max 0.000 }
;* Mem bank perf. penalty (est.) : 0.0%
;*
;*
;* Total cycles (est.) : 6 + min_trip_cnt * 2 = 38
;*----------------------------------------------------------------------------*
;* SETUP CODE
;*
;* MV B5,B4
;* ADD 12,B4,B4
;* ADD 8,B5,B5
;*
;* SINGLE SCHEDULED ITERATION
;*
;* $C$C1866:
;* 0 LDW .D2T2 *B5++(8),B0 ; |223|
;* 1 LDW .D2T2 *B4++(8),B0 ; |223|
;* 2 NOP 3
;* 5 [ B0] MIN2 .L1 A3,A4,A4 ; |223| ^
;* 6 [ B0] MIN2 .L1 A5,A4,A4 ; |223| ^
;* || ADD .D1 2,A5,A5 ; |219|
;* || ADD .S1 2,A3,A3 ; |219|
;* || SPBR $C$C1866
;* 7 NOP 1
;* 8 ; BRANCHCC OCCURS {$C$C1866} ; |219|
;*----------------------------------------------------------------------------*
$C$L76: ; PIPED LOOP PROLOG
; EXCLUSIVE CPU CYCLES: 7
.dwpsn file "ctrl_utils.h",line 219,column 0,is_stmt,isa 0
SPLOOPD 2 ;8 ; (P)
|| STW .D1T1 A4,*A3 ; |1574|
|| ADD .L2 8,B31,B5
|| MVC .S2 B7,ILC
;** --------------------------------------------------------------------------*
$C$L77: ; PIPED LOOP KERNEL
$C$DW$L$ulc_FgrCtrl$82$B:
.dwpsn file "ctrl_utils.h",line 220,column 0,is_stmt,isa 0
; EXCLUSIVE CPU CYCLES: 2
SPMASK L2
|| ADD .L2 12,B31,B4
|| LDW .D2T2 *B5++(8),B0 ; |223| (P) <0,0>
LDW .D2T2 *B4++(8),B0 ; |223| (P) <0,1>
NOP 2
SPMASK L1,S1
|| ZERO .L1 A3 ; |219|
|| MV .S1 A10,A4 ; |212|
SPMASK S1
|| MVK .S1 0x1,A5
|| [ B0] MIN2 .L1 A3,A4,A4 ; |223| (P) <0,5> ^
.dwpsn file "ctrl_utils.h",line 225,column 0,is_stmt,isa 0
SPKERNEL 0,0
|| ADD .S1 2,A3,A3 ; |219| <0,6>
|| ADD .D1 2,A5,A5 ; |219| <0,6>
|| [ B0] MIN2 .L1 A5,A4,A4 ; |223| <0,6> ^
$C$DW$L$ulc_FgrCtrl$82$E:
;** --------------------------------------------------------------------------*
$C$L78: ; PIPED LOOP EPILOG
; EXCLUSIVE CPU CYCLES: 5
;** 210 ----------------------- Index = K$217; // [109]
;** 219 ----------------------- K$226 = 32u; // [109]
;** ----------------------- goto g67;
;** -----------------------g66:
;** 230 ----------------------- C$201 = &aEnabledBitField[WordFound]; // [109]
;** 230 ----------------------- C$202 = *C$201; // [109]
;** 230 ----------------------- (C$202&1u) ? (C$203 = 0u) : (C$203 = _norm((int)_bitr(C$202&0xfffffffeu))+1u); // [109]
;** 230 ----------------------- Index = (WordFound<<5)+C$203; // [109]
;** 231 ----------------------- *C$201 = ~(1u<<C$203)&C$202; // [109]
;** 230 ----------------------- K$226 = 32u; // [109]
;** -----------------------g67:
;** 234 ----------------------- Index = Index; // [109]
;* 1568 ----------------------- if ( Index < K$217 ) goto g62;
MVK .S2 32,B7 ; |227|
MVK .S1 0x3e8,A11 ; |1518|
|| MVK .S2 448,B5 ; |230|
NOP 1
MVK .S1 0x3e8,A30 ; |1518|
NOP 1
;** --------------------------------------------------------------------------*
$C$DW$L$ulc_FgrCtrl$84$B:
; EXCLUSIVE CPU CYCLES: 26
MVK .L1 1,A3 ; |231|
MV .L2X A4,B6
ADDAW .D2 SP,B6,B4 ; |230|
|| CMPLTU .L2 B6,B7,B1 ; |227|
ADD .L2 B5,B4,B4 ; |230|
[ B1] LDW .D2T2 *B4,B13 ; |230|
SHL .S1X B6,5,A31 ; |230|
[!B1] ZERO .L2 B2 ; |230|
[ B1] MVK .S1 0x20,A10 ; |230|
[!B1] MVK .S1 0x20,A10 ; |219|
[ B1] AND .L2 1,B13,B2 ; |230|
MV .L2 B2,B0 ; |230|
[!B1] MVK .L2 0x1,B0 ; |230|
[!B0] AND .L2 -2,B13,B5 ; |230|
[!B0] BITR .M2 B5,B5 ; |230|
[ B2] ZERO .L2 B10 ; |230|
[!B0] NORM .L2 B5,B5 ; |230|
[!B0] ADD .L2 1,B5,B10 ; |230|
[ B1] SHL .S2X A3,B10,B5 ; |231|
[ B1] ADD .L1X B10,A31,A11 ; |230|
|| [ B1] ANDN .L2 B13,B5,B5 ; |231|
CMPLTU .L1 A11,A30,A0 ; |1568|
|| [ B1] STW .D2T2 B5,*B4 ; |231|
[ A0] BNOP .S1 $C$L75,5 ; |1568|
; BRANCHCC OCCURS {$C$L75} ; |1568|
$C$DW$L$ulc_FgrCtrl$84$E:
;** --------------------------------------------------------------------------*
Thanks for your help,
SPH.