| Fortran DVM - contents | Part 1 (1-4) | Part 2 (5-6) | Part 3 (7-12) | Part 4 (Annexes) |
| created: april 2001 | - last edited 03.05.01 - |
2.2. Syntax of FDVM directives
| directive-line | is CDVM$ dvm-directive |
| or *DVM$ dvm-directive | |
| dvm-directive | is specification-directive |
| or executable-directive | |
| specification-directive | is processors-directive |
| or align-directive | |
| or distribute-directive | |
| or template-directive | |
| or pointer-directive | |
| or shadow-directive | |
| or dynamic-directive | |
| or inherit-directive | |
| or remote-group-directive | |
| or reduction-group-directive | |
| or task-directive | |
| or heap-directive | |
| or asyncid-directive | |
| executable-directive | is realign-directive |
| or redistribute-directive | |
| or parallel-directive | |
| or remote-access-directive | |
| or shadow-group-directive | |
| or shadow-start-directive | |
| or shadow-wait-directive | |
| or reduction-start-directive | |
| or reduction-wait-directive | |
| or new-value-directive | |
| or prefetch-directive | |
| or reset-directive | |
| or parallel-task-loop-directive | |
| or map-directive | |
| or task-region-directive | |
| or end-task-region-directive | |
| or on-directive | |
| or end-on-directive |
| or f90-directive | |
| or asynchronous-directive | |
| or end-asynchronous-directive | |
| or asyncwait-directive |
Constraints:
Definition. A specification expression is an expression where each primary is:
3. Virtual processor arrangements. PROCESSORS directive
| processors-directive | is PROCESSORS processors-decl-list |
| processors-decl | is processors-name ( explicit-shape-spec-list ) |
| explicit-shape-spec | is [ lower-bound : ] upper-bound |
| lower-bound | is int-expr |
| upper-bound | is int-expr |
4.1. DISTRIBUTE and REDISTRIBUTE directives
| distribute-directive | is dist-action distributee dist-directive-stuff |
| or dist-action [ dist-directive-stuff ] :: distributee-list | |
| dist-action | is DISTRIBUTE |
| or REDISTRIBUTE | |
| dist-directive-stuff | is dist-format-list [ dist-onto-clause ] |
| distributee | is array-name |
| or pointer-name | |
| or template-name | |
| dist-format | is BLOCK |
| or GEN_BLOCK ( block-size-array ) | |
| or WGT_BLOCK ( block-weight-array , nblock ) | |
| or * | |
| dist-onto-clause | is ONTO dist-target |
| dist-target | is processors-name [(processors-section-subscript-list )] |
| or task-name ( task-index ) | |
| processors-section-subscript | is [ subscript ] : [ subscript ] |
| subscript | is int-expr |
| nblock | is int-expr |
| block-size-array | is array-name |
| block-weight-array | is array-name |
Constraints:
4.2.2. Dynamic arrays in FDVM model. POINTER directive
| pointer-directive | is type , POINTER ( dimension-list ) :: pointer-name-list |
| dimension | is : |
| pointer-name | is scalar-int-variable-name |
| or int-array-name |
| heap-directive | is HEAP array-name-list |
4.3.1. ALIGN and REALIGN directives
| align-directive | is align-action alignee align-directive-stuff |
| or align-action [ align-directive-stuff ] :: alignee-list | |
| align-action | is ALIGN |
| or REALIGN | |
| align-directive-stuff | is ( align-source-list ) align-with-clause |
| alignee | is array-name |
| or pointer-name | |
| align-source | is * |
| or align-dummy | |
| align-dummy | is scalar-int-variable |
| align-with-clause | is WITH align-spec |
| align-spec | is align-target ( align-subscript-list ) |
| align-target | is array-name |
| or template-name | |
| or pointer-name | |
| align-subscript | is int-expr |
| or align-dummy-use | |
| or * | |
| align-dummy-use | is
[ primary-expr * ] align-dummy [ add-op primary-expr ] |
| primary-expr | is int-constant |
| or int-variable | |
| or ( int-expr ) | |
| add-op | is + |
| or - |
Constraints:
4.3.2. TEMPLATE directive
| template-directive | is TEMPLATE template-decl-list |
| template-decl | is template-name [ ( explicit-shape-spec-list ) ] |
4.4. DYNAMIC and NEW_VALUE directives
| dynamic-directive | is DYNAMIC alignee-or-distributee-list |
| alignee-or-distributee | is alignee |
| or distributee | |
| new-value-directive | is NEW_VALUE |
5.1.2. Distribution of loop iterations. PARALLEL directive
| parallel-directive | is PARALLEL ( do-variable-list ) ON iteration-align-spec [ , new-clause ] [ , reduction-clause] [ , shadow-renew-clause] [ , shadow-compute-clause] [ , remote-access-clause ] [ , across-clause ] |
| iteration-align-spec | is align-target ( iteration-align-subscript-list ) |
| iteration-align-subscript | is int-expr |
| or do-variable-use | |
| or * | |
| do-variable-use | is
[ primary-expr *
] do-variable [ add-op primary-expr] |
5.1.3. Private variables. NEW clause
| new-clause | is NEW ( new-variable-list ) |
| new-variable | is array-name |
| or scalar-variable-name |
Constraint: The distributed arrays cannot be used as NEW-variables.
5.1.4. Reduction operations and variables. REDUCTION clause
| reduction-clause | is
REDUCTION ( [ reduction-group-name : ] reduction-op-list ) |
| reduction-op | is reduction-op-name ( reduction-variable ) |
| or reduction-loc-name
( reduction-variable , location-variable, int-expr) |
|
| reduction-variable | is array-name |
| or scalar-variable-name | |
| location-variable | is array-name |
| reduction-op-name | is SUM |
| or PRODUCT | |
| or MAX | |
| or MIN | |
| or AND | |
| or OR | |
| or EQV | |
| or NEQV | |
| reduction-loc-name | is MAXLOC |
| or MINLOC |
Constraints:
6.2.1. Specification of array with shadow edges
| shadow-directive | is SHADOW dist-array ( shadow-edge-list ) |
| or SHADOW ( shadow-edge-list ) :: dist-array-list | |
| dist-array | is array-name |
| or pointer-name | |
| shadow-edge | is width |
| or low-width : high-width | |
| width | is int-expr |
| low-width | is int-expr |
| high-width | is int-expr |
Constraints:
6.2.2. Synchronous specification of independent references of SHADOW type for single loop
| shadow-renew-clause | is SHADOW_RENEW ( renewee-list ) |
| or shadow-start-directive | |
| or shadow-wait-directive | |
| renewee | is dist-array-name [ ( shadow-edge-list )] [ (CORNER) ] |
Constraints:
6.2.3. Computing values in shadow edges. SHADOW_COMPUTE clause
| shadow-compute-clause | is SHADOW_COMPUTE |
6.2.4. ACROSS specification of dependent references of SHADOW type for single loop
| across-clause | is ACROSS ( dependent-array-list ) |
| dependent-array | is dist-array-name ( dependence-list ) [(section-spec-list)] |
| dependence | is flow-dep-length : anti-dep-length |
| flow-dep-length | is int-constant |
| anti-dep-length | is int-constant |
| section-spec | is SECTION (section-subscript-list) |
Constraint:
6.2.5. Asynchronous specification of independent references of SHADOW type
| shadow-group-directive | is
SHADOW_GROUP shadow-group-name ( renewee-list ) |
| shadow-start-directive | is SHADOW_START shadow-group-name |
| shadow-wait-directive | is SHADOW_WAIT shadow-group-name |
Constraints:
6.3.1. REMOTE_ACCESS directive
| remote-access-directive | is
REMOTE_ACCESS ( [ remote-group-name : ] regular-reference-list) |
| regular-reference | is dist-array-name [( regular-subscript-list )] |
| regular-subscript | is int-expr |
| or do-variable-use | |
| or : | |
| remote-access-clause | is remote-access-directive |
6.3.3. Asynchronous specification of REMOTE type references
| remote-group-directive | is REMOTE_GROUP remote-group-name-list |
Constraint:
| prefetch-directive | is PREFETCH remote-group-name |
| reset-directive | is RESET remote-group-name |
Constraints:
6.3.4.2.1. ASYNCID directive
| asyncid-directive | is ASYNCID async-name-list |
6.3.4.2.2. F90 directive
| f90-directive | is F90 copy-statement |
| copy-statement | is array-section = array-section |
| array-section | is array-name [ ( section-subscript-list ) ] |
| section-subscript | is subscript |
| or subscript-triplet | |
| subscript-triplet | is [ subscript ] : [ subscript ] [ : stride] |
| subscript | is int-expr |
| stride | is int-expr |
6.3.4.2.3. ASYNCHRONOUS and END ASYNCHRONOUS directives
| asynchronous-construct | is asynchronous-directive |
| f90-directive [ f90-directive ] copy-loop [ copy-loop ] |
|
| end-asynchronous-directive | |
| asynchronous-directive | is ASYNCHRONOUS async-name |
| end-asynchronous-directive | is END ASYNCHRONOUS |
6.3.4.2.4. ASYNCWAIT directive
| asyncwait-directive | is ASYNCWAIT async-name |
6.4.2. Asynchronous specification of REDUCTION type references
| reduction-group-directive | is REDUCTION_GROUP reduction-group-name-list |
| reduction-start-directive | is REDUCTION_START reduction-group-name |
| reduction-wait-directive | is REDUCTION_WAIT reduction-group-name |
Constraints.
7.1. Declaration of task array
| task-directive | is TASK task-list |
| task | is task-name ( max-task ) |
7.2. Mapping tasks on processors. MAP directive
| map-directive | is MAP task-name ( task-index ) |
| ONTO processors-name(processors-section-subscript-list) |
7.4. Distribution of computations. TASK_REGION directive
| block-task-region | is task-region-directive |
| on-block | |
| [ on-block ]... | |
| end-task-region-directive | |
| task-region-directive | is TASK_REGION task-name [ , reduction-clause ] |
| end-task-region-directive | is END TASK_REGION |
| on-block | is on-directive |
| block | |
| end-on-directive | |
| on-directive | is ON task-name ( task-index ) [ , new-clause ] |
| end-on-directive | is END ON |
| loop-task-region | is task-region-directive |
| parallel-task-loop | |
| end-task-region-directive | |
| parallel-task-loop | is parallel-task-loop-directive |
| do-loop | |
| parallel-task-loop-directive | is
PARALLEL ( do-variable
) ON task-name ( do-variable ) [ , new-clause ] |
9. Procedures
| inherit-directive | is INHERIT dummy-array-name-list |
Seven small scientific programs are presented to illustrate Fortran DVM language features. They are intended for solving a system of linear equations:
A x = b
where:
A - matrix of coefficients,
b - vector of free members,
x - vector of unknowns.
The following basic methods are used for solving this system.
Direct methods. The well-known Gaussian Elimination method is the most commonly used algorithm of this class. The main idea of this algorithm is to reduce the matrix A to upper triangular form and then to use backward substitution to diagonalize the matrix.
Explicit iteration methods. Jacobi Relaxation is the most known algorithm of this class. The algorithm perform the following computation iteratively
xi,jnew = (xi-1,jold + xi,j-1old + xi+1,jold + xi,j+1old ) / 4
Implicit iteration methods. Successive Over Relaxation (SOR) refers to this class. The algorithm performs the following calculation iteratively
xi,jnew = ( w / 4 ) * (xi-1,jnew + xi,j-1new + xi+1,jold + xi,j+1old ) + (1-w) * xi,jold
By using red-black coloring of variables each step of SOR consists of two half Jacobi steps. One processes redvariables and the other processes black variables. Coloring of variables allows to overlap calculation and communication.
Example 1. Gauss elimination algorithm
PROGRAM GAUSS C Solving linear equation system A´x = b PARAMETER ( N = 100 ) REAL A( N, N+1 ), X( N ) C A : Coefficient matrix with dimension (N,N+1) C Right hand side vector of linear equations is stored C into last column (N+1)-th, of matrix A C X : Unknown vector C N : Number of linear equations *DVM$ DISTRIBUTE A (BLOCK,*) *DVM$ ALIGN X(I) WITH A(I,N+1) C C Initialization C *DVM$ PARALLEL ( I ) ON A(I,*) DO 100 I = 1, N DO 100 J = 1, N+1 IF (( I .EQ. J ) THEN A(I,J) = 2.0 ELSE IF ( J .EQ. N+1) THEN A(I,J) = 0.0 ENDIF ENDIF 100 CONTINUE C C Elimination C DO 1 I = 1, N C the I-th row of array A will be buffered before C execution of I-th iteration, and references A(I,K), A(I,I) C will be replaced with corresponding reference to buffer *DVM$ PARALLEL ( J ) ON A(J,*), REMOTE_ACCESS ( A(I,:) ) DO 5 J = I+1, N DO 5 K = I+1, N+1 A(J,K) = A(J,K) - A(J,I) * A(I,K) / A(I,I) 5 CONTINUE 1 CONTINUE C First calculate X(N) X(N) = A(N,N+1) / A(N,N) C C Solve X(N-1), X(N-2), ...,X(1) by backward substitution C DO 6 J = N-1, 1, -1 C the (J+1)-th elements of array X will be buffered before C execution of J-th iteration, and reference X(J+1) C will be replaced with reference to temporal variable *DVM$ PARALLEL ( I ) ON A(I,*), REMOTE_ACCESS ( X(J+1) ) DO 7 I = 1, J A(I,N+1) = A(I,N+1) - A(I,J+1) * X(J+1) 7 CONTINUE X(J) = A(J,N+1) / A(J,J) 6 CONTINUE PRINT *, X END
PROGRAM JACOB PARAMETER (K=8, ITMAX=20) REAL A(K,K), B(K,K), EPS, MAXEPS CDVM$ DISTRIBUTE A (BLOCK, BLOCK) CDVM$ ALIGN B(I,J) WITH A(I,J) C arrays A and B with block distribution PRINT *, '********** TEST_JACOBI **********' MAXEPS = 0.5E - 7 CDVM$ PARALLEL (J,I) ON A(I,J) C nest of two parallel loops, iteration (i,j) will be executed on C processor, which is owner of element A(i,j) DO 1 J = 1, K DO 1 I = 1, K A(I,J) = 0. IF(I.EQ.1 .OR. J.EQ.1 .OR. I.EQ.K .OR. J.EQ.K) THEN B(I,J) = 0. ELSE B(I,J) = 1. + I + J ENDIF 1 CONTINUE DO 2 IT = 1, ITMAX EPS = 0. CDVM$ PARALLEL (J,I) ON A(I,J), REDUCTION ( MAX( EPS )) C variable EPS is used for calculation of maximum value DO 21 J = 2, K-1 DO 21 I = 2, K-1 EPS = MAX ( EPS, ABS( B(I,J) - A(I,J))) A(I,J) = B(I,J) 21 CONTINUE CDVM$ PARALLEL (J,I) ON B(I,J), SHADOW_RENEW (A) C copying shadow elements of array A from C neighboring processors before loop execution DO 22 J = 2, K-1 DO 22 I = 2, K-1 B(I,J) = (A(I-1,J) + A(I,J-1) + A(I+1,J) + A(I,J+1)) / 4 22 CONTINUE PRINT *, 'IT = ', IT, ' EPS = ', EPS IF ( EPS . LT . MAXEPS ) GO TO 3 2 CONTINUE 3 OPEN (3, FILE='JACOBI.DAT', FORM='FORMATTED') WRITE (3,*) B CLOSE (3) END
Example 3. Jacobi algorithm (asynchronous version)
PROGRAM JACOB1 PARAMETER (K=8, ITMAX=20) REAL A(K,K), B(K,K), EPS, MAXEPS CDVM$ DISTRIBUTE A (BLOCK, BLOCK) CDVM$ ALIGN B(I,J) WITH A(I,J) C arrays A and B with block distribution CDVM$ REDUCTION_GROUP REPS PRINT *, '********** TEST_JACOBI_ASYNCHR **********' CDVM$ SHADOW_GROUP SA (A) C creation of shadow edge group MAXEPS = 0.5E - 7 CDVM$ PARALLEL (J,I) ON A(I,J) C nest of two parallel loops, iteration (i,j) will be executed on C processor, which is owner of element A(i,j) DO 1 J = 1, K DO 1 I = 1, K A(I,J) = 0. IF(I.EQ.1 .OR. J.EQ.1 .OR. I.EQ.K .OR. J.EQ.K) THEN B(I,J) = 0. ELSE B(I,J) = 1. + I + J ENDIF 1 CONTINUE DO 2 IT = 1, ITMAX EPS = 0. C group of reduction operations is created C and initial values of reduction variables are stored CDVM$ PARALLEL (J,I) ON A(I,J), SHADOW_START SA, CDVM$* REDUCTION_GROUP ( REPS : MAX( EPS )) C the loops iteration order is changed: C at first boundary elements of A are calculated and sent, C then internal elements of array A are calculated DO 21 J = 2, K-1 DO 21 I = 2, K-1 EPS = MAX ( EPS, ABS( B(I,J) - A(I,J))) A(I,J) = B(I,J) 21 CONTINUE CDVM$ REDUCTION_START REPS C start of reduction operation to accumulate the partial results C calculated in copies of variable EPS on every processor CDVM$ PARALLEL (J,I) ON B(I,J), SHADOW_WAIT SA C the loops iteration order is changed: C at first internal elements of B are calculated, C then shadow edge elements of array A from neighboring processors C are received, then boundary elements of array B are calculated DO 22 J = 2, K-1 DO 22 I = 2, K-1 B(I,J) = (A(I-1,J) + A(I,J-1) + A(I+1,J) + A(I,J+1)) / 4 22 CONTINUE CDVM$ REDUCTION_WAIT REPS C waiting completion of reduction operation PRINT *, 'IT = ', IT, ' EPS = ', EPS IF ( EPS . LT . MAXEPS ) GO TO 3 2 CONTINUE 3 OPEN (3, FILE='JACOBI.DAT', FORM='FORMATTED') WRITE (3,*) B CLOSE (3) END
Example 4. Successive over-relaxation
PROGRAM SOR
PARAMETER ( N = 100 )
REAL A( N, N ), EPS, MAXEPS, W
INTEGER ITMAX
*DVM$ DISTRIBUTE A (BLOCK,BLOCK)
ITMAX = 20
MAXEPS = 0.5E - 5
W = 0.5
*DVM$ PARALLEL (I,J) ON A(I,J)
DO 1 I = 1, N
DO 1 J = 1, N
IF ( I .EQ.J) THEN
A(I,J) = N + 2
ELSE
A(I,J) = -1.0
ENDIF
1 CONTINUE
DO 2 IT = 1, ITMAX
EPS = 0.
*DVM$ PARALLEL (I,J) ON A(I,J), NEW (S),
*DVM$* REDUCTION ( MAX( EPS )), ACROSS (A(1:1,1:1))
C S variable private variable
C (its usage is localized in the range of one iteration)
C EPS variable is used for maximum calculation
DO 21 I = 2, N-1
DO 21 J = 2, N-1
S = A(I,J)
A(I,J) = (W / 4) * (A(I-1,J) + A(I+1,J) + A(I,J-1) +
* A(I,J+1)) + ( 1-W ) * A(I,J)
EPS = MAX ( EPS, ABS( S - A(I,J)))
21 CONTINUE
PRINT *, 'IT = ', IT, ' EPS = ', EPS
IF (EPS .LT. MAXEPS ) GO TO 4
2 CONTINUE
4 PRINT *, A
END
Example 5. Red-black successive over-relaxation
PROGRAM REDBLACK
PARAMETER ( N = 100 )
REAL A( N, N ), EPS, MAXEPS, W
INTEGER ITMAX
*DVM$ DISTRIBUTE A (BLOCK,BLOCK)
ITMAX = 20
MAXEPS = 0.5E - 5
W = 0.5
*DVM$ PARALLEL (I,J) ON A(I,J)
DO 1 I = 1, N
DO 1 J = 1, N
IF ( I .EQ.J) THEN
A(I,J) = N + 2
ELSE
A(I,J) = -1.0
ENDIF
1 CONTINUE
DO 2 IT = 1, ITMAX
EPS = 0.
C loop for red and black variables
DO 3 IRB = 1,2
*DVM$ PARALLEL (I,J) ON A(I,J), NEW (S),
*DVM$* REDUCTION ( MAX( EPS )), SHADOW_RENEW (A)
C variable S - private variable in loop iterations
C variable EPS is used for calculation of maximum value
C Exception : iteration space is not rectangular
DO 21 I = 2, N-1
DO 21 J = 2 + MOD( I+ IRB, 2 ), N-1, 2
S = A(I,J)
A(I,J) = (W / 4) * (A(I-1,J) + A(I+1,J) + A(I,J-1) +
* A(I,J+1)) + ( 1-W ) * A(I,J)
EPS = MAX ( EPS, ABS( S - A(I,J)))
21 CONTINUE
3 CONTINUE
PRINT *, 'IT = ', IT, ' EPS = ', EPS
IF (EPS .LT. MAXEPS ) GO TO 4
2 CONTINUE
4 PRINT *, A
END
Example 6. Static tasks (parallel sections)
PROGRAM TASKS C rectangular grid is subdivided on two blocks
| C | K | |
| C | N1 | A1, B1 |
| C | N2 | A2, B2 |
C PARAMETER (K=100, N1 = 50, ITMAX=10, N2 = K N1 ) CDVM$ PROCESSORS P(NUMBER_OF_PROCESSORS( )) REAL A1(N1+1,K), A2(N2+1,K), B1(N1+1,K), B2(N2+1,K) INTEGER LP(2), HP(2) CDVM$ TASK MB( 2 ) CDVM$ ALIGN B1(I,J) WITH A1(I,J) CDVM$ ALIGN B2(I,J) WITH A2(I,J) CDVM$ DISTRIBUTE :: A1, A2 CDVM$ REMOTE_GROUP BOUND CALL DPT(LP, HP, 2) C Task (block) distribution over processors C Array distribution over tasks CDVM$ MAP MB( 1 ) ONTO P( LP(1) : HP(1) ) CDVM$ REDISTRIBUTE A1( *, BLOCK ) ONTO MB( 1 ) CDVM$ MAP MB( 2 ) ONTO P( LP(2) : HP(2) ) CDVM$ REDISTRIBUTE A2(*,BLOCK) ONTO MB( 2 ) C Initialization CDVM$ PARALLEL (J,I) ON A1(I,J) DO 10 J = 1, K DO 10 I = 1, N1 IF(I.EQ.1 .OR. J.EQ.1 .OR. J.EQ.K) THEN A1(I,J) = 0. B1(I,J) = 0. ELSE B1(I,J) = 1. + I + J A1(I,J) = B1(I, J) ENDIF 10 CONTINUE CDVM$ PARALLEL (J,I) ON A2(I,J) DO 20 J = 1, K DO 20 I = 2, N2+1 IF(I.EQ.N2+1 .OR. J.EQ.1 .OR. J.EQ.K) THEN A2(I,J) = 0. B2(I,J) = 0. ELSE B2(I,J) = 1. + ( I + N1 1 ) + J A2(I,J) = B2(I,J) ENDIF 20 CONTINUE DO 2 IT = 1, ITMAX CDVM$ PREFETCH BOUND C exchange of edges CDVM$ PARALLEL ( J ) ON A1(N1+1, J), CDVM$* REMOTE_ACCESS (BOUND : B2(2,J) ) DO 30 J = 1, K 30 A1(N1+1, J) = B2(2, J) CDVM$ PARALLEL ( J ) ON A2(1,J), CDVM$* REMOTE_ACCESS (BOUND : B1(N1,J) ) DO 40 J = 1, K 40 A2(1,J) = B1(N1,J) CDVM$ TASK_REGION MB CDVM$ ON MB( 1 ) CDVM$ PARALLEL (J,I) ON B1(I,J), CDVM$* SHADOW_RENEW ( A1 ) DO 50 J = 2, K-1 DO 50 I = 2, N1 50 B1(I,J) = (A1(I-1,J) + A1(I,J-1) + A1(I+1,J) + A1(I,J+1)) / 4 CDVM$ PARALLEL (J,I) ON A1(I,J) DO 60 J = 2, K-1 DO 60 I = 2, N1 60 A1(I,J) = B1(I,J) CDVM$ END ON CDVM$ ON MB( 2 ) CDVM$ PARALLEL (J,I) ON B2(I,J), CDVM$* SHADOW_RENEW ( A2 ) DO 70 J = 2, K-1 DO 70 I = 2, N2 70 B2(I,J) = (A2(I-1,J) + A2(I,J-1) + A2(I+1,J) + A2(I,J+1)) / 4 CDVM$ PARALLEL (J,I) ON A2(I,J) DO 80 J = 2, K-1 DO 80 I = 2, N2 80 A2(I,J) = B2(I,J) CDVM$ END ON CDVM$ END TASK_REGION 2 CONTINUE PRINT *, 'A1 ' PRINT *, A1 PRINT *, 'A2 ' PRINT *, A2 END SUBROUTINE DPT( LP, HP, NT ) C processor distribution for NT tasks (NT = 2) INTEGER LP(2), HP(2) NUMBER_OF_PROCESSORS( ) = 1 NP = NUMBER_OF_PROCESSORS( ) NTP = NP/NT IF(NP.EQ.1) THEN LP(1) = 1 HP(1) = 1 LP(2) = 1 HP(2) = 1 ELSE LP(1) = 1 HP(1) = NTP LP(2) = NTP+1 HP(2) = NP END IF END
Example 7. Dynamic tasks (task loop)
PROGRAM MULTIBLOCK
C Model of multi-block task.
C The number of blocks, size of each block,
C external and internal edges
C are defined during program execution.
C Test of following FDVM constructs: dynamic arrays,
C dynamic tasks, asynchronous REMOTE_ACCESS for dynamic
C arrays (formal arguments)
*DVM$ PROCESSORS MBC100( NUMBER_OF_PROCESSORS( ) )
PARAMETER (M = 8, N =8, NTST = 1)
C MXSIZE dynamic memory size
C MXBL maximal number of blocks
PARAMETER ( MXS=10000 )
PARAMETER ( MXBL=2 )
C HEAP dynamic memory
REAL HEAP(MXS)
C PA,PB arrays of pointers for dynamic arrays
C PA(I),PB(I) function value on previous and current step
C in I-th block
*DVM$ REAL, POINTER (:,:) :: PA, PB, P1, P2
*DVM$ DYNAMIC PA, PB, P1, P2
INTEGER PA(MXBL), PB(MXBL), P1, P2
C SIZE( 1:2, I) sizes of dimensions of I-th block
INTEGER SIZE( 2, MXBL ) , ALLOCATE
C TINB( :,I ) table of internal edges of I-th block
C TINB( 1,I ) - - the number of edges (from 1 till 4)
C TINB( 2,I ) = J - adjacent block number
C TINB( 3,I ),TINB( 4,I ) - edges of one-dimensional section
C TINB( 5,I ) - dimension number in I-th block (1 or 2)
C TINB( 6,I ) - dimension coordinate in I-th block
C TINB( 7,I ) - dimension number in J-th block (1 or 2)
C TINB( 8,I ) - dimension coordinate in J-th block
INTEGER TINB( 29, MXBL )
C TEXB( :,I ) table of external edges of I-th block
C TEXB( 1,I ) - (îò 1 äî 4) edges amount (from 1 to 4)
C TEXB( 2,I ),TEXB( 3,I ) - coordinates of one-dimensional array
C section for 1-th edge
C TEXB( 4,I ) - dimension number (1 or 2)
C TEXB( 5,I ) - coordinate of given dimension
INTEGER TEXB(17,MXBL)
C NBL - the number of blocks
C NTST the number of steps
INTEGER NBL, NTST
C IDM pointer to free dynamic memory
INTEGER IDM
COMMON IDM,MXSIZE
C postponed distribution of arrays on each block
*DVM$ DISTRIBUTE :: PA, P1
*DVM$ ALIGN :: PB, P2
C task array
*DVM$ TASK TSA ( MXBL )
C name of group exchange of internal edges
*DVM$ REMOTE_GROUP GRINB
C LP( I ), HP( I ) edges of processor array section of I-th block
INTEGER LP(MXBL), HP(MXBL)
C TGLOB( :, I ) table of global coordinates
C in Jacobi algorithm grid for I-th block
C TGLOB( 1, I ) 1-th dimension coordinate
C TGLOB( 2, I ) 2-th dimension coordinate
INTEGER TGLOB(2,MXBL)
MXSIZE = MXS
C subdividing M*N block on sub-blocks
CALL DISDOM(NBL,TGLOB,TEXB,TINB,SIZE,M,N,MXBL)
C Dividing processor array on blocks
CALL MPROC(LP,HP,SIZE,NBL)
C Distribution of tasks (blocks) over processors.
C Array distribution over tasks
IDM = 1
DO 10 IB = 1, NBL
*DVM$ MAP TSA( IB ) ONTO MBC100( LP(IB) : HP(IB) )
PA(IB) = ALLOCATE ( SIZE(1,IB))
P1 = PA(IB)
*DVM$ REDISTRIBUTE (*,BLOCK) ONTO TSA(IB) :: P1
PB(IB) = ALLOCATE ( SIZE(1,IB))
P2 = PB(I)
*DVM$ REALIGN P2(I,J) WITH P1(I,J)
10 CONTINUE
C External edge initialization
DO 20 IB=1,NBL
LS = 0
DO 20 IS = 1,TEXB(1,IB)
CALL INEXB (HEAP(PA(IB)), HEAP(PB(IB)), SIZE(1,IB), SIZE(2,IB),
* TEXB(LS+2,IB), TEXB(LS+3,IB), TEXB(LS+4,IB), TEXB(LS+5,IB) )
LS = LS+4
20 CONTINUE
C Initialization of blocks
DO 25 IB = 1,NBL
CALL INDOM (HEAP(PA(IB)), HEAP(PB(IB)), SIZE(1,IB), SIZE(2,IB),
* TGLOB(1,IB), TGLOB(2,IB))
LS = LS+4
25 CONTINUE
DO 65 IB = 1,NBL
CALL PRTB(HEAP(PA(IB)), SIZE(1,IB), SIZE(2,IB ),IB)
65 CONTINUE
C Iteration loop
DO 30 IT = 1, NTST
C surpassed pumping of buffers for internal edges
*DVM$ PREFETCH GRINB
C value calculation on internal edges
DO 40 IB = 1, NBL
LS = 0
DO 40 IS = 1, TINB(1,IB)
J = TINB(LS+2, IB)
CALL CMPINB (HEAP(PA(IB)), HEAP(PA(J)),
* SIZE(1,IB), SIZE(2,IB), SIZE(1,J), SIZE(2,J),
* TINB(LS+3,IB), TINB(LS+4,IB), TINB(LS+5,IB),
* TINB(LS+6,IB), TINB(LS+7,IB), TINB(LS+8,IB) )
LS = LS+7
40 CONTINUE
C value calculation inside blocks
C each block is a task
*DVM$ TASK_REGION TSA
*DVM$ PARALLEL ( IB ) ON TSA( IB )
DO 50 IB = 1,NBL
CALL JACOBI(HEAP(PA(IB)), HEAP(PB(IB)), SIZE(1,IB), SIZE(2,IB))
50 CONTINUE
*DVM$ END TASK_REGION
30 CONTINUE
C end of iterations
C output of array values
DO 60 IB = 1,NBL
CALL PRTB(HEAP(PA(IB)), SIZE(1,IB), SIZE(2,IB ),IB)
60 CONTINUE
END
INTEGER FUNCTION ALLOCATE( SIZE )
C dynamic array distribution for sequential execution
INTEGER SIZE(2)
COMMON IDM,MXSIZE
ALLOCATE = IDM
IDM = IDM + SIZE(1)*SIZE(2)
IF(IDM.GT.MXSIZE) THEN
PRINT *, 'NO MEMORY'
STOP
ENDIF
RETURN
END
SUBROUTINE CMPINB ( AI, AJ, N1, N2, M1, M2, S1, S2,
* ID, INDI, JD, INDJ)
C value calculation on internal edges
DIMENSION AI(N1,N2), AJ(M1, M2)
INTEGER S1, S2
*DVM$ INHERIT AI, AJ
*DVM$ REMOTE_GROUP GRINB
IF ( ID .EQ. 1 ) THEN
IF ( JD .EQ. 1 ) THEN
*DVM$ PARALLEL ( K ) ON AI(INDI,K),
*DVM$* REMOTE_ACCESS (GRINB : AJ(INDJ,K) )
DO 10 K = S1,S2
10 AI(INDI,K) = AJ(INDJ,K)
ELSE
*DVM$ PARALLEL ( K ) ON AI( INDI, K ),
*DVM$* REMOTE_ACCESS (GRINB : AJ(K,INDJ) )
DO 20 K = S1, S2
20 AI(INDI,K) = AJ(K,INDJ)
ENDIF
ELSE
IF ( JD .EQ. 1 ) THEN
*DVM$ PARALLEL ( K ) ON AI(K,INDI),
*DVM$* REMOTE_ACCESS (GRINB : AJ(INDJ,K) )
DO 30 K = S1,S2
30 AI(K, INDI) = AJ(INDJ,K)
ELSE
*DVM$ PARALLEL ( K ) ON AI(K,INDI),
*DVM$* REMOTE_ACCESS (GRINB : AJ(K,INDJ) )
DO 40 K = S1, S2
40 AI(K,INDI) = AJ(K,INDJ)
ENDIF
ENDIF
END
SUBROUTINE MPROC(LP,HP,SIZE,NBL)
C processor distribution over blocks
INTEGER LP(NBL),HP(NBL),SIZE(2,NBL)
C distribution for two blocks NBL=2
NUMBER_OF_PROCESSORS( ) = 1
NP = NUMBER_OF_PROCESSORS( )
NPT = NP/NBL
IF(NP.EQ.1) THEN
LP(1) = 1
HP(1) = 1
LP(2) = 1
HP(2) = 1
ELSE
LP(1) = 1
HP(1) = NPT
LP(2) = NPT+1
HP(2) = NP
ENDIF
END
SUBROUTINE INEXB(A,B,N1,N2,S1,S2,ID,INDI)
C external edge initialization
DIMENSION A(N1,N2),B(N1,N2)
INTEGER S1,S2
*DVM$ INHERIT A,B
IF(ID.EQ.1) THEN
*DVM$ PARALLEL (K) ON A(INDI,K)
DO 10 K = S1,S2
A(INDI,K) = 0
B(INDI,K) = 0
10 CONTINUE
ELSE
*DVM$ PARALLEL (K) ON A(K,INDI)
DO 20 K = S1,S2
A(K,INDI) = 0
B(K,INDI) = 0
20 CONTINUE
ENDIF
END
SUBROUTINE INDOM(A,B,M,N,X1,X2)
C block initialization
DIMENSION A(M,N), B(M,N)
INTEGER X1,X2
*DVM$ INHERIT A,B
*DVM$ PARALLEL (I,J) ON A(I,J)
DO 10 I = 2,M-1
DO 10 J = 2,N-1
A(I,J) = I+J+X1+X2-3
B(I,J) = A(I,J)
10 CONTINUE
END
SUBROUTINE JACOBI(A,B,N,M)
DIMENSION A(N,M), B(N,M)
*DVM$ INHERIT A,B
*DVM$ PARALLEL (I,J) ON B(I,J)
DO 10 I = 2,N-1
DO 10 J = 2,M-1
10 B(I,J) = (A(I-1,J)+A(I+1,J)+A(I,J-1)+A(I,J+1))/4
*DVM$ PARALLEL (I,J) ON A(I,J)
DO 20 I = 2,N-1
DO 20 J = 2,M-1
20 A(I,J) = B(I,J)
END
SUBROUTINE PRTB(B,N,M,IB)
C print data for IB block
DIMENSION B(N,M)
*DVM$ INHERIT B
PRINT *, 'BLOCK', IB
PRINT *, B
END
SUBROUTINE DISDOM (NBL,TGL,TEXB,TINB,SIZE,M,N,MXBL)
INTEGER TGL(2,MXBL), TEXB(17,MXBL), TINB(29,MXBL), SIZE(2,MXBL)
INTEGER DM(20), DN(20),KDM,KDN,S,GM,GN
C subdividing M*N block on two sub-blocks M*(N/2) and M* (N-N/2)
DM(1) = M
KDM = 1
DN(1) = N/2
DN(2) = N - N/2
KDN = 2
S = 0
DO 10 I = 1,KDM
10 S = S + DM(I)
IF(S.NE.M) THEN
PRINT *, 'wrong division M'
STOP
ENDIF
DO 15 IB = 1,MXBL
TEXB(1,IB) = 0
TINB(1,IB) = 0
15 CONTINUE
S = 0
DO 20 J = 1,KDN
20 S = S + DN(J)
IF(S.NE.N) THEN
PRINT *, 'wrong division N'
STOP
ENDIF
DM(1) = DM(1) - 1
DN(1) = DN(1) - 1
DM(KDM) = DM(KDM) - 1
DN(KDN) = DN(KDN) - 1
C producing tables (graphs) of external and internal edges
IB = 1
GM = 2
GN = 2
DO 30 J = 1,KDN
DO 40 I = 1,KDM
IF (I.EQ.1) THEN
L = TEXB(1,IB)*4
TEXB(L+2,IB) = 1
TEXB(L+3,IB) = DN(J)+2
TEXB(L+4,IB) = 1
TEXB(L+5,IB) = 1
TEXB(1,IB) = TEXB(1,IB)+1
ELSE
L = TINB(1,IB)*7
TINB(L+2,IB) = IB-1
TINB(L+3,IB) = 1
TINB(L+4,IB) = DN(J)+2
TINB(L+5,IB) = 1
TINB(L+6,IB) = 1
TINB(L+7,IB) = 1
TINB(L+8,IB) = DM(I-1)+1
TINB(1,IB) = TINB(1,IB)+1
ENDIF
IF (I.EQ.KDM) THEN
L = TEXB(1,IB)*4
TEXB(L+2,IB) = 1
TEXB(L+3,IB) = DN(J)+2
TEXB(L+4,IB) = 1
TEXB(L+5,IB) = DM(I)+2
TEXB(1,IB) = TEXB(1,IB)+1
ELSE
L = TINB(1,IB)*7
TINB(2,IB) = IB+1
TINB(3,IB) = 1
TINB(4,IB) = DN(J)+2
TINB(5,IB) = 1
TINB(6,IB) = DM(I)+2
TINB(7,IB) = 1
TINB(8,IB) = 2
TINB(1,IB) = TINB(1,IB)+1
ENDIF
IF (J.EQ.1) THEN
L = TEXB(1,IB)*4
TEXB(L+2,IB) = 1
TEXB(L+3,IB) = DM(I)+2
TEXB(L+4,IB) = 2
TEXB(L+5,IB) = 1
TEXB(1,IB) = TEXB(1,IB)+1
ELSE
L = TINB(1,IB)*7
TINB(L+2,IB) = IB-KDM
TINB(L+3,IB) = 1
TINB(L+4,IB) = DM(I)+2
TINB(L+5,IB) = 2
TINB(L+6,IB) = 1
TINB(L+7,IB) = 2
TINB(L+8,IB) = DN(J-1)+1
TINB(1,IB) = TINB(1,IB)+1
ENDIF
IF (J.EQ.KDN) THEN
L = TEXB(1,IB)*4
TEXB(L+2,IB) = 1
TEXB(L+3,IB) = DM(I)+2
TEXB(L+4,IB) = 2
TEXB(L+5,IB) = DN(J)+2
TEXB(1,IB) = TEXB(1,IB)+1
ELSE
L = TINB(1,IB)*7
TINB(L+2,IB) = IB+KDM
TINB(L+3,IB) = 1
TINB(L+4,IB) = DM(I)+2
TINB(L+5,IB) = 2
TINB(L+6,IB) = DN(J)+2
TINB(L+7,IB) = 2
TINB(L+8,IB) = 2
TINB(1,IB) = TINB(1,IB)+1
ENDIF
SIZE(1,IB) = DM(I)+2
SIZE(2,IB) = DN(J)+2
TGL(1,IB) = GM
TGL(2,IB) = GN
GM = GM+DM(I)
IB = IB+1
40 CONTINUE
GM = 2
GN = GN+DN(J)
30 CONTINUE
NBL = IB-1
END
| Fortran DVM - contents | Part 1 (1-4) | Part 2 (5-6) | Part 3 (7-12) | Part 4 (Annexes) |