# state_machine.yaml
  1. AWSTemplateFormatVersion: "2010-09-09"
  2. Transform: AWS::Serverless-2016-10-31
  3. Description: Amazon S3 Find and Forget State Machine
  4. Globals:
  5. Function:
  6. Runtime: python3.9
  7. Timeout: 900
  8. Tracing: Active
  9. Layers: !Ref CommonLayers
  10. Environment:
  11. Variables:
  12. DataMapperTable: !Ref DataMapperTableName
  13. DeletionQueueTable: !Ref DeletionQueueTableName
  14. GlueDatabase: !Ref GlueDatabase
  15. LogLevel: !Ref LogLevel
  16. JobManifestsGlueTable: !Ref JobManifestsGlueTable
  17. JobTable: !Ref JobTableName
  18. StateBucket: !Ref ResultBucket
  19. ManifestsBucket: !Ref ManifestsBucket
  20. Parameters:
  21. AthenaWorkGroup:
  22. Description: WorkGroup to use for Athena queries
  23. Type: String
  24. Default: primary
  25. CommonLayers:
  26. Type: CommaDelimitedList
  27. Description: Common layers supplied to all functions
  28. DataMapperTableName:
  29. Description: Table name for Data Mapper Table
  30. Type: String
  31. DeleteQueueUrl:
  32. Type: String
  33. DeleteServiceName:
  34. Type: String
  35. DeletionQueueTableName:
  36. Type: String
  37. ECSCluster:
  38. Type: String
  39. GlueDatabase:
  40. Type: String
  41. JobTableName:
  42. Description: Table name for Jobs Table
  43. Type: String
  44. LogLevel:
  45. Type: String
  46. Default: INFO
  47. AllowedValues:
  48. - CRITICAL
  49. - FATAL
  50. - ERROR
  51. - WARNING
  52. - INFO
  53. - DEBUG
  54. - NOTSET
  55. JobManifestsGlueTable:
  56. Type: String
  57. ManifestsBucket:
  58. Type: String
  59. ResultBucket:
  60. Type: String
  61. StateMachinePrefix:
  62. Type: String
  63. Resources:
  64. StatesExecutionRole:
  65. Type: "AWS::IAM::Role"
  66. Properties:
  67. AssumeRolePolicyDocument:
  68. Version: "2012-10-17"
  69. Statement:
  70. - Effect: "Allow"
  71. Principal:
  72. Service:
  73. - !Sub states.${AWS::Region}.amazonaws.com
  74. Action: "sts:AssumeRole"
  75. Path: "/"
  76. Policies:
  77. - PolicyName: StatesExecutionPolicy
  78. PolicyDocument:
  79. Version: "2012-10-17"
  80. Statement:
  81. - Effect: Allow
  82. Action: "lambda:InvokeFunction"
  83. Resource:
  84. - !GetAtt CheckQueueSize.Arn
  85. - !GetAtt ExecuteQuery.Arn
  86. - !GetAtt CheckQueryStatus.Arn
  87. - !GetAtt CheckTaskCount.Arn
  88. - !GetAtt SubmitQueryResults.Arn
  89. - !GetAtt GenerateQueries.Arn
  90. - !GetAtt OrchestrateECSServiceScaling.Arn
  91. - !GetAtt WorkQueryQueue.Arn
  92. - !GetAtt DeleteQueueMessage.Arn
  93. - !GetAtt PurgeQueue.Arn
  94. - !GetAtt EmitEvent.Arn
  95. - Effect: Allow
  96. Action:
  97. - "events:PutTargets"
  98. - "events:PutRule"
  99. - "events:DescribeRule"
  100. Resource: !Sub arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/*
  101. - Effect: Allow
  102. Action: "states:ListStateMachines"
  103. Resource: "*"
  104. - Effect: Allow
  105. Action:
  106. - "states:DescribeExecution"
  107. - "states:DescribeStateMachineForExecution"
  108. Resource: !Sub "arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:execution:${StateMachinePrefix}*:*"
  109. - Effect: Allow
  110. Action:
  111. - "states:DescribeStateMachine"
  112. - "states:ListExecutions"
  113. - "states:StartExecution"
  114. Resource: !Sub "arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:${StateMachinePrefix}*"
  # Runs one Athena query end to end: start it, poll until it finishes,
  # retry while $.ExecutionRetriesLeft > 0, then submit the results and emit
  # a QuerySucceeded event, or emit QueryFailed and fail the execution.
  AthenaStateMachine:
    Type: AWS::StepFunctions::StateMachine
    Properties:
      StateMachineName: !Sub ${StateMachinePrefix}-AthenaStateMachine
      RoleArn: !GetAtt StatesExecutionRole.Arn
      DefinitionString: !Sub |-
        {
          "StartAt": "Execute Query",
          "States": {
            "Execute Query": {
              "Type": "Task",
              "Comment": "Start an Athena query asynchronously",
              "Resource": "${ExecuteQuery.Arn}",
              "Parameters": {
                "QueryData.$": "$",
                "Bucket": "${ResultBucket}",
                "Prefix": "queries"
              },
              "ResultPath": "$.QueryId",
              "Retry": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "IntervalSeconds": 10,
                  "BackoffRate": 10,
                  "MaxAttempts": 2
                }
              ],
              "Catch": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "ResultPath": "$.ErrorDetails",
                  "Next": "Handle Error"
                }
              ],
              "Next": "Get Query Status"
            },
            "Wait for Query": {
              "Type": "Wait",
              "Comment": "Waits before checking again whether Athena is done",
              "SecondsPath": "$.WaitDuration",
              "Next": "Get Query Status"
            },
            "Get Query Status": {
              "Type": "Task",
              "Comment": "Gets the status of the given Athena query",
              "Resource": "${CheckQueryStatus.Arn}",
              "Retry": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "IntervalSeconds": 10,
                  "BackoffRate": 10,
                  "MaxAttempts": 2
                }
              ],
              "Catch": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "ResultPath": "$.ErrorDetails",
                  "Next": "Handle Error"
                }
              ],
              "Next": "Query Complete?"
            },
            "Query Complete?": {
              "Type": "Choice",
              "Comment": "Check if the Athena query is still running",
              "Choices": [
                {
                  "Variable": "$.State",
                  "StringEquals": "SUCCEEDED",
                  "Next": "Submit Query Results"
                },
                {
                  "Or": [
                    {
                      "Variable": "$.State",
                      "StringEquals": "FAILED"
                    },
                    {
                      "Variable": "$.State",
                      "StringEquals": "CANCELLED"
                    }
                  ],
                  "Next": "Retriable?"
                }
              ],
              "Default": "Wait for Query"
            },
            "Retriable?": {
              "Type": "Choice",
              "Comment": "Check if the Athena query can be retried",
              "Choices": [
                {
                  "Variable": "$.ExecutionRetriesLeft",
                  "NumericGreaterThan": 0,
                  "Next": "Execute Query"
                }
              ],
              "Default": "Raise Query Error"
            },
            "Raise Query Error": {
              "Type": "Pass",
              "Parameters": {
                "Cause.$": "$.Reason",
                "Error": "Query Failed"
              },
              "ResultPath": "$.ErrorDetails",
              "Next": "Handle Error"
            },
            "Handle Error": {
              "Type": "Pass",
              "Parameters": {
                "Error.$": "$.ErrorDetails.Error",
                "Cause.$": "$.ErrorDetails.Cause",
                "State.$": "$"
              },
              "Next": "Emit Error"
            },
            "Emit Error": {
              "Type": "Task",
              "Comment": "Emit the failure event",
              "Resource": "${EmitEvent.Arn}",
              "Parameters": {
                "EventName": "QueryFailed",
                "EventData.$": "$",
                "EmitterId": "StepFunctions",
                "JobId.$": "$.State.JobId"
              },
              "ResultPath": null,
              "Next": "Query Failed"
            },
            "Query Failed": {
              "Type": "Fail",
              "Comment": "The query was unsuccessful"
            },
            "Submit Query Results": {
              "Type": "Task",
              "Comment": "Obtain the query results from S3 and send them to Fargate",
              "Resource": "${SubmitQueryResults.Arn}",
              "ResultPath": null,
              "Retry": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "MaxAttempts": 0
                }
              ],
              "Catch": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "ResultPath": "$.ErrorDetails",
                  "Next": "Handle Error"
                }
              ],
              "Next": "Query Succeeded"
            },
            "Query Succeeded": {
              "Type": "Pass",
              "Comment": "The query was successful",
              "Parameters": {
                "JobId.$": "$.JobId",
                "QueryId.$": "$.QueryId",
                "PartitionKeys.$": "$.PartitionKeys",
                "Statistics.$": "$.Statistics",
                "DataMapperId.$": "$.DataMapperId",
                "Table.$": "$.Table"
              },
              "Next": "Emit Success"
            },
            "Emit Success": {
              "Type": "Task",
              "Comment": "Emit the successful query event",
              "Resource": "${EmitEvent.Arn}",
              "Parameters": {
                "EventName": "QuerySucceeded",
                "EventData.$": "$",
                "EmitterId": "StepFunctions",
                "JobId.$": "$.JobId"
              },
              "ResultPath": null,
              "Retry": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "IntervalSeconds": 3,
                  "BackoffRate": 1.5,
                  "MaxAttempts": 1
                }
              ],
              "End": true
            }
          }
        }
  # Drives the deletion (forget) workers: sets the ECS service's desired
  # count from the deletion queue size, waits for the queue to drain, sets
  # the count again, then polls until the service reports zero tasks.
  DeleteStateMachine:
    Type: AWS::StepFunctions::StateMachine
    Properties:
      StateMachineName: !Sub ${StateMachinePrefix}-DeletionStateMachine
      RoleArn: !GetAtt StatesExecutionRole.Arn
      DefinitionString: !Sub |-
        {
          "StartAt": "Fetch Queue Size",
          "States": {
            "Fetch Queue Size": {
              "Type": "Task",
              "Comment": "Checks the number of messages in the Object Deletion Queue",
              "Resource": "${CheckQueueSize.Arn}",
              "Parameters": {
                "QueueUrl": "${DeleteQueueUrl}"
              },
              "ResultPath": "$.Queue",
              "Retry": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "IntervalSeconds": 10,
                  "MaxAttempts": 1
                }
              ],
              "Next": "Adjust Deletion Service Instance Count"
            },
            "Adjust Deletion Service Instance Count": {
              "Type": "Task",
              "Comment": "Sets the desired instance count based on Object Deletion Queue size",
              "Resource": "${OrchestrateECSServiceScaling.Arn}",
              "Parameters": {
                "Cluster": "${ECSCluster}",
                "DeleteService": "${DeleteServiceName}",
                "DeletionTasksMaxNumber.$": "$.DeletionTasksMaxNumber",
                "QueueSize.$": "$.Queue.Total"
              },
              "ResultPath": "$.DesiredCount",
              "Retry": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "IntervalSeconds": 10,
                  "MaxAttempts": 1
                }
              ],
              "Next": "Items in Queue?"
            },
            "Items in Queue?": {
              "Type": "Choice",
              "Comment": "Checks if any tasks are being created/terminated",
              "Choices": [
                {
                  "Variable": "$.DesiredCount",
                  "NumericEquals": 0,
                  "Next": "Fetch Task Count"
                },
                {
                  "Variable": "$.DesiredCount",
                  "NumericGreaterThan": 0,
                  "Next": "Wait"
                }
              ],
              "Default": "Not sure, fail"
            },
            "Wait": {
              "Type": "Wait",
              "Comment": "Waits before checking if the Object Deletion Queue is empty",
              "SecondsPath": "$.WaitDuration",
              "Next": "Fetch Queue Size Again"
            },
            "Fetch Queue Size Again": {
              "Type": "Task",
              "Comment": "Checks the number of messages in the Object Deletion Queue",
              "Resource": "${CheckQueueSize.Arn}",
              "Parameters": {
                "QueueUrl": "${DeleteQueueUrl}"
              },
              "ResultPath": "$.Queue",
              "Retry": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "IntervalSeconds": 10,
                  "MaxAttempts": 1
                }
              ],
              "Next": "Queue is Empty?"
            },
            "Queue is Empty?": {
              "Type": "Choice",
              "Comment": "Checks if the Total messages are 0",
              "Choices": [
                {
                  "Variable": "$.Queue.Total",
                  "NumericEquals": 0,
                  "Next": "Adjust Deletion Service Instance Count"
                },
                {
                  "Variable": "$.Queue.Total",
                  "NumericGreaterThan": 0,
                  "Next": "Wait"
                }
              ],
              "Default": "Not sure, fail"
            },
            "Fetch Task Count": {
              "Type": "Task",
              "Comment": "Checks the number of tasks for a service",
              "Resource": "${CheckTaskCount.Arn}",
              "Parameters": {
                "Cluster": "${ECSCluster}",
                "ServiceName": "${DeleteServiceName}"
              },
              "ResultPath": "$.TaskCount",
              "Retry": [
                {
                  "ErrorEquals": ["States.ALL"],
                  "IntervalSeconds": 10,
                  "MaxAttempts": 1
                }
              ],
              "Next": "Has Fargate Shutdown?"
            },
            "Has Fargate Shutdown?": {
              "Type": "Choice",
              "Comment": "Checks if the remaining tasks is 0",
              "Choices": [
                {
                  "Variable": "$.TaskCount.Total",
                  "NumericEquals": 0,
                  "Next": "Done"
                },
                {
                  "Variable": "$.TaskCount.Total",
                  "NumericGreaterThan": 0,
                  "Next": "Wait for Fargate Shutdown"
                }
              ],
              "Default": "Not sure, fail"
            },
            "Wait for Fargate Shutdown": {
              "Type": "Wait",
              "Comment": "Waits before checking if Fargate has reached the desired count",
              "Seconds": 10,
              "Next": "Fetch Task Count"
            },
            "Not sure, fail": {
              "Type": "Fail"
            },
            "Done": {
              "Type": "Succeed"
            }
          }
        }
  # Top-level job workflow: emit JobStarted, purge both queues, run the find
  # phase (generate queries, then work the query queue until no executions
  # remain), run the forget phase by starting DeleteStateMachine
  # synchronously, and emit phase events along the way. Caught errors are
  # routed to one of the "Handle ... Error" states, which pick the event
  # name that "Emit Error" then sends.
  StateMachine:
    Type: AWS::StepFunctions::StateMachine
    Properties:
      StateMachineName: !Sub ${StateMachinePrefix}-StateMachine
      RoleArn: !GetAtt StatesExecutionRole.Arn
      DefinitionString: !Sub |-
        {
          "Comment": "State machine for processing the S3 Find and Forget deletion queue.",
          "StartAt": "Start Job",
          "States": {
            "Start Job": {
              "Type": "Pass",
              "Parameters": {
                "ExecutionId.$": "$$.Execution.Id",
                "ExecutionName.$": "$$.Execution.Name",
                "AthenaConcurrencyLimit.$": "$.AthenaConcurrencyLimit",
                "AthenaQueryMaxRetries.$": "$.AthenaQueryMaxRetries",
                "DeletionTasksMaxNumber.$": "$.DeletionTasksMaxNumber",
                "ForgetQueueWaitSeconds.$": "$.ForgetQueueWaitSeconds",
                "QueryExecutionWaitSeconds.$": "$.QueryExecutionWaitSeconds",
                "QueryQueueWaitSeconds.$": "$.QueryQueueWaitSeconds"
              },
              "Next": "Emit Job Started"
            },
            "Emit Job Started": {
              "Type": "Task",
              "Parameters": {
                "EventName": "JobStarted",
                "EventData.$": "$$.State.EnteredTime",
                "EmitterId": "StepFunctions",
                "JobId.$": "$$.Execution.Name"
              },
              "Resource": "${EmitEvent.Arn}",
              "ResultPath": null,
              "Next": "Purge Queues",
              "Catch": [{
                "ErrorEquals": ["States.ALL"],
                "ResultPath": "$.ErrorDetails",
                "Next": "Handle Error"
              }]
            },
            "Purge Queues": {
              "Comment": "Purges the query and object deletion queues. Purges wait 61 seconds before retrying to avoid exceeding the max purge attempt rate for SQS.",
              "Type": "Parallel",
              "Next": "Start Find Phase",
              "ResultPath": null,
              "Branches": [{
                "StartAt": "Purge Query Queue",
                "States": {
                  "Purge Query Queue": {
                    "Parameters": {
                      "QueueUrl": "${QueryQueue}"
                    },
                    "Comment": "Purge the query queue",
                    "Type": "Task",
                    "Resource": "${PurgeQueue.Arn}",
                    "End": true,
                    "Retry": [{
                      "ErrorEquals": [ "States.ALL" ],
                      "IntervalSeconds": 61,
                      "MaxAttempts": 1
                    }]
                  }
                }
              }, {
                "StartAt": "Purge Deletion Queue",
                "States": {
                  "Purge Deletion Queue": {
                    "Parameters": {
                      "QueueUrl": "${DeleteQueueUrl}"
                    },
                    "Comment": "Purge the deletion queue.",
                    "Type": "Task",
                    "Resource": "${PurgeQueue.Arn}",
                    "End": true,
                    "Retry": [{
                      "ErrorEquals": [ "States.ALL" ],
                      "IntervalSeconds": 61,
                      "MaxAttempts": 1
                    }]
                  }
                }
              }],
              "Catch": [{
                "ErrorEquals": ["States.ALL"],
                "ResultPath": "$.ErrorDetails",
                "Next": "Handle Error"
              }]
            },
            "Start Find Phase": {
              "Type": "Task",
              "Parameters": {
                "EventName": "FindPhaseStarted",
                "EventData.$": "$$.State.EnteredTime",
                "EmitterId": "StepFunctions",
                "JobId.$": "$$.Execution.Name"
              },
              "Resource": "${EmitEvent.Arn}",
              "Next": "Generate Queries",
              "ResultPath": null,
              "Catch": [{
                "ErrorEquals": ["States.ALL"],
                "ResultPath": "$.ErrorDetails",
                "Next": "Handle Error"
              }]
            },
            "Generate Queries": {
              "Comment": "Process each of the data mappers and populate the query queue",
              "Type": "Task",
              "Resource": "${GenerateQueries.Arn}",
              "ResultPath": "$.QueriesStats",
              "Next": "End Query Planning",
              "Catch": [{
                "ErrorEquals": ["States.ALL"],
                "ResultPath": "$.ErrorDetails",
                "Next": "Handle Find Error"
              }]
            },
            "End Query Planning": {
              "Type": "Task",
              "Parameters": {
                "EventName": "QueryPlanningComplete",
                "EventData.$": "$.QueriesStats",
                "EmitterId": "StepFunctions",
                "JobId.$": "$$.Execution.Name"
              },
              "Resource": "${EmitEvent.Arn}",
              "ResultPath": null,
              "Next": "Work Queue",
              "Catch": [{
                "ErrorEquals": ["States.ALL"],
                "ResultPath": "$.ErrorDetails",
                "Next": "Handle Find Error"
              }]
            },
            "Work Queue": {
              "Comment": "Works the query queue by starting Athena State Machine executions",
              "Type": "Task",
              "Resource": "${WorkQueryQueue.Arn}",
              "ResultPath": "$.RunningExecutions",
              "Next": "Outstanding Queries?",
              "Catch": [{
                "ErrorEquals": ["States.ALL"],
                "ResultPath": "$.ErrorDetails",
                "Next": "Handle Find Error"
              }]
            },
            "Outstanding Queries?": {
              "Comment": "Checks if any queries are yet to be run or are in progress",
              "Type": "Choice",
              "Choices": [
                {
                  "Variable": "$.RunningExecutions.Total",
                  "NumericEquals": 0,
                  "Next": "End Find Phase"
                },
                {
                  "Variable": "$.RunningExecutions.Total",
                  "NumericGreaterThan": 0,
                  "Next": "Wait for Queries"
                }
              ],
              "Default": "Wait for Queries"
            },
            "Wait for Queries": {
              "Comment": "Waits before rechecking if any queries are still outstanding",
              "Type": "Wait",
              "SecondsPath": "$.QueryQueueWaitSeconds",
              "Next": "Work Queue"
            },
            "End Find Phase": {
              "Type": "Task",
              "Parameters": {
                "EventName": "FindPhaseEnded",
                "EventData.$": "$$.State.EnteredTime",
                "EmitterId": "StepFunctions",
                "JobId.$": "$$.Execution.Name"
              },
              "Resource": "${EmitEvent.Arn}",
              "ResultPath": null,
              "Next": "Start Forget Phase",
              "Catch": [{
                "ErrorEquals": ["States.ALL"],
                "ResultPath": "$.ErrorDetails",
                "Next": "Handle Error"
              }]
            },
            "Start Forget Phase": {
              "Type": "Task",
              "Parameters": {
                "EventName": "ForgetPhaseStarted",
                "EventData.$": "$$.State.EnteredTime",
                "EmitterId": "StepFunctions",
                "JobId.$": "$$.Execution.Name"
              },
              "Resource": "${EmitEvent.Arn}",
              "ResultPath": null,
              "Next": "Start Fargate Workflow",
              "Catch": [{
                "ErrorEquals": ["States.ALL"],
                "ResultPath": "$.ErrorDetails",
                "Next": "Handle Error"
              }]
            },
            "Start Fargate Workflow": {
              "Type":"Task",
              "Resource":"arn:${AWS::Partition}:states:::states:startExecution.sync",
              "Parameters":{
                "Input":{
                  "AWS_STEP_FUNCTIONS_STARTED_BY_EXECUTION_ID.$": "$$.Execution.Id",
                  "DeletionTasksMaxNumber.$": "$.DeletionTasksMaxNumber",
                  "WaitDuration.$": "$.ForgetQueueWaitSeconds"
                },
                "StateMachineArn":"${DeleteStateMachine}",
                "Name.$": "$$.Execution.Name"
              },
              "ResultPath": null,
              "Next": "End Forget Phase",
              "Catch": [{
                "ErrorEquals": ["States.ALL"],
                "ResultPath": "$.ErrorDetails",
                "Next": "Handle Forget Error"
              }]
            },
            "End Forget Phase": {
              "Type": "Task",
              "Parameters": {
                "EventName": "ForgetPhaseEnded",
                "EventData.$": "$$.State.EnteredTime",
                "EmitterId": "StepFunctions",
                "JobId.$": "$$.Execution.Name"
              },
              "Resource": "${EmitEvent.Arn}",
              "ResultPath": null,
              "End": true
            },
            "Handle Error": {
              "Type": "Pass",
              "Parameters": {
                "EventName": "Exception",
                "Error.$": "$.ErrorDetails.Error",
                "Cause.$": "$.ErrorDetails.Cause",
                "State.$": "$"
              },
              "Next": "Emit Error"
            },
            "Handle Find Error": {
              "Type": "Pass",
              "Parameters": {
                "EventName": "FindPhaseFailed",
                "Error.$": "$.ErrorDetails.Error",
                "Cause.$": "$.ErrorDetails.Cause",
                "State.$": "$"
              },
              "Next": "Emit Error"
            },
            "Handle Forget Error": {
              "Type": "Pass",
              "Parameters": {
                "EventName": "ForgetPhaseFailed",
                "Error.$": "$.ErrorDetails.Error",
                "Cause.$": "$.ErrorDetails.Cause",
                "State.$": "$"
              },
              "Next": "Emit Error"
            },
            "Emit Error": {
              "Comment": "Emit the generic failure event",
              "Type": "Task",
              "Parameters": {
                "EventName.$": "$.EventName",
                "EventData.$": "$",
                "EmitterId": "StepFunctions",
                "JobId.$": "$$.Execution.Name"
              },
              "Resource": "${EmitEvent.Arn}",
              "ResultPath": null,
              "End": true
            }
          }
        }
  # Supporting Resources

  # Queue of pending queries: filled by GenerateQueries and consumed by
  # WorkQueryQueue (both receive this queue's URL via environment variables).
  QueryQueue:
    Type: AWS::SQS::Queue
    Properties:
      # 43200 seconds = 12 hours.
      VisibilityTimeout: 43200
      KmsMasterKeyId: alias/aws/sqs
  # Tasks

  # Lambda task behind the "Fetch Queue Size" states; may read attributes of
  # the query queue and of the deletion queue.
  CheckQueueSize:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: check_queue_size.handler
      Policies:
        - Statement:
            - Effect: "Allow"
              Action:
                - "sqs:GetQueueAttributes"
              Resource:
                - !GetAtt QueryQueue.Arn
                # The deletion queue arrives as a URL only, so its ARN is
                # rebuilt from the 5th "/"-separated segment of that URL
                # (assumes the https://<host>/<account>/<name> URL shape).
                - !Sub
                  - arn:${AWS::Partition}:sqs:${AWS::Region}:${AWS::AccountId}:${QueueName}
                  - QueueName: !Select [4, !Split ["/", !Ref DeleteQueueUrl]]
  # Lambda task behind "Fetch Task Count"; may describe only the deletion
  # ECS service.
  CheckTaskCount:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: check_task_count.handler
      Policies:
        - Statement:
            - Effect: "Allow"
              Action:
                - "ecs:DescribeServices"
              Resource:
                - !Sub arn:${AWS::Partition}:ecs:${AWS::Region}:${AWS::AccountId}:service/${ECSCluster}/${DeleteServiceName}
  # Lambda task behind "Execute Query": starts an Athena query in the
  # configured workgroup. The S3, Lake Formation and KMS grants apply only
  # when the request is made via Athena (aws:CalledVia condition).
  ExecuteQuery:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: execute_query.handler
      Environment:
        Variables:
          WorkGroup: !Ref AthenaWorkGroup
      Policies:
        - S3ReadPolicy:
            BucketName: !Ref ManifestsBucket
        - S3CrudPolicy:
            BucketName: !Ref ResultBucket
        - Statement:
            # Read-only access to the Glue Data Catalog.
            - Effect: "Allow"
              Action:
                - "glue:BatchGetPartition"
                - "glue:GetDatabase*"
                - "glue:GetPartition*"
                - "glue:GetTable*"
              Resource:
                - !Sub "arn:${AWS::Partition}:glue:*:*:catalog*"
                - !Sub "arn:${AWS::Partition}:glue:*:*:database*"
                - !Sub "arn:${AWS::Partition}:glue:*:*:table*"
                - !Sub "arn:${AWS::Partition}:glue:*:*:partition*"
            - Effect: "Allow"
              Action:
                - "athena:StartQueryExecution"
              Resource: !Sub "arn:${AWS::Partition}:athena:${AWS::Region}:${AWS::AccountId}:workgroup/${AthenaWorkGroup}"
            # Data access on any resource, but only via Athena.
            - Effect: "Allow"
              Action:
                - "s3:GetObject*"
                - "s3:ListBucket*"
                - "lakeformation:GetDataAccess"
              Resource: "*"
              Condition:
                ForAnyValue:StringEquals:
                  aws:CalledVia:
                    - !Sub athena.${AWS::URLSuffix}
            # NotResource: this grant covers every KMS key EXCEPT keys in
            # this account and region, and again only via Athena.
            - Effect: "Allow"
              Action:
                - "kms:Encrypt"
                - "kms:Decrypt"
                - "kms:ReEncrypt*"
                - "kms:GenerateDataKey*"
                - "kms:DescribeKey"
              NotResource: !Sub "arn:${AWS::Partition}:kms:${AWS::Region}:${AWS::AccountId}:key/*"
              Condition:
                ForAnyValue:StringEquals:
                  aws:CalledVia:
                    - !Sub athena.${AWS::URLSuffix}
  # Lambda task behind "Get Query Status"; may read query executions in the
  # configured Athena workgroup only.
  CheckQueryStatus:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: check_query_status.handler
      Policies:
        - Statement:
            - Effect: "Allow"
              Action:
                - "athena:GetQueryExecution"
              Resource: !Sub "arn:${AWS::Partition}:athena:${AWS::Region}:${AWS::AccountId}:workgroup/${AthenaWorkGroup}"
  # Lambda task behind "Submit Query Results". It receives the deletion
  # queue URL as QueueUrl and may send messages to both the query queue and
  # the deletion queue.
  SubmitQueryResults:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: submit_query_results.handler
      MemorySize: 512
      Environment:
        Variables:
          QueueUrl: !Ref DeleteQueueUrl
      Policies:
        - S3ReadPolicy:
            BucketName: !Ref ResultBucket
        - Statement:
            - Effect: "Allow"
              Action:
                - "athena:GetQueryResults"
              Resource: !Sub "arn:${AWS::Partition}:athena:${AWS::Region}:${AWS::AccountId}:workgroup/${AthenaWorkGroup}"
            - Effect: "Allow"
              Action:
                - "sqs:SendMessage"
                - "sqs:GetQueueAttributes"
              Resource:
                - !GetAtt QueryQueue.Arn
                # Deletion queue ARN rebuilt from the queue name embedded in
                # its URL (5th "/"-separated segment).
                - !Sub
                  - arn:${AWS::Partition}:sqs:${AWS::Region}:${AWS::AccountId}:${QueueName}
                  - QueueName: !Select [4, !Split ["/", !Ref DeleteQueueUrl]]
  # Lambda task behind "Generate Queries": reads the job, data mapper and
  # deletion queue tables plus the Glue catalog, may create partitions on
  # the job manifests Glue table, and sends messages to the query queue.
  GenerateQueries:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: generate_queries.handler
      MemorySize: 512
      Environment:
        Variables:
          QueryQueue: !Ref QueryQueue
      Policies:
        - S3CrudPolicy:
            BucketName: !Ref ManifestsBucket
        - DynamoDBReadPolicy:
            TableName: !Ref JobTableName
        - DynamoDBReadPolicy:
            TableName: !Ref DataMapperTableName
        - DynamoDBReadPolicy:
            TableName: !Ref DeletionQueueTableName
        - Statement:
            # Read-only access to the Glue Data Catalog.
            - Effect: "Allow"
              Action:
                - "glue:BatchGetPartition"
                - "glue:GetDatabase*"
                - "glue:GetPartition*"
                - "glue:GetTable*"
              Resource:
                - !Sub "arn:${AWS::Partition}:glue:*:*:catalog*"
                - !Sub "arn:${AWS::Partition}:glue:*:*:database*"
                - !Sub "arn:${AWS::Partition}:glue:*:*:table*"
                - !Sub "arn:${AWS::Partition}:glue:*:*:partition*"
            # Partition creation is limited to the job manifests table.
            - Effect: "Allow"
              Action:
                - "glue:BatchCreatePartition"
              Resource:
                - !Sub "arn:${AWS::Partition}:glue:*:*:catalog*"
                - !Sub "arn:${AWS::Partition}:glue:*:*:database/${GlueDatabase}"
                - !Sub "arn:${AWS::Partition}:glue:*:*:table/${GlueDatabase}/${JobManifestsGlueTable}"
            - Effect: Allow
              Action:
                - "sqs:SendMessage*"
                - "sqs:GetQueueAttributes"
              Resource:
                - !GetAtt QueryQueue.Arn
  # Lambda task behind "Adjust Deletion Service Instance Count"; may update
  # only the deletion ECS service.
  OrchestrateECSServiceScaling:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: orchestrate_ecs_service_scaling.handler
      Policies:
        - Statement:
            - Effect: "Allow"
              Action:
                - "ecs:UpdateService"
              Resource: !Sub "arn:${AWS::Partition}:ecs:${AWS::Region}:${AWS::AccountId}:service/${ECSCluster}/${DeleteServiceName}"
  # Lambda task behind "Work Queue": consumes the query queue and may start
  # and describe executions of the Athena state machine. The state machine
  # ARN is built from its name rather than referenced directly.
  WorkQueryQueue:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: work_query_queue.handler
      Environment:
        Variables:
          QueueUrl: !Ref QueryQueue
          StateMachineArn: !Sub "arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:${StateMachinePrefix}-AthenaStateMachine"
      Policies:
        - S3CrudPolicy:
            BucketName: !Ref ResultBucket
        - Statement:
            - Effect: Allow
              Action:
                - "states:StartExecution"
                - "states:DescribeExecution"
              Resource:
                - !Sub "arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:${StateMachinePrefix}-AthenaStateMachine"
                - !Sub "arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:execution:${StateMachinePrefix}-AthenaStateMachine:*"
            - Effect: "Allow"
              Action:
                - "sqs:ReceiveMessage*"
                - "sqs:ChangeMessageVisibility"
                - "sqs:PurgeQueue"
                - "sqs:DeleteMessage"
                - "sqs:GetQueueAttributes"
              Resource:
                - !GetAtt QueryQueue.Arn
  # Lambda that may delete messages from the query queue. It is invokable by
  # the state machine role, though no state visible in this template
  # references it.
  DeleteQueueMessage:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: delete_message.handler
      Environment:
        Variables:
          QueueUrl: !Ref QueryQueue
      Policies:
        - Statement:
            - Effect: "Allow"
              Action:
                - "sqs:DeleteMessage"
                - "sqs:GetQueueAttributes"
              Resource: !GetAtt QueryQueue.Arn
  # Lambda task behind both branches of "Purge Queues"; may purge the query
  # queue and the deletion queue.
  PurgeQueue:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: purge_queue.handler
      Policies:
        - Statement:
            - Effect: "Allow"
              Action:
                - "sqs:PurgeQueue"
                - "sqs:GetQueueAttributes"
              Resource:
                - !GetAtt QueryQueue.Arn
                # Deletion queue ARN rebuilt from the queue name embedded in
                # its URL (5th "/"-separated segment).
                - !Sub
                  - arn:${AWS::Partition}:sqs:${AWS::Region}:${AWS::AccountId}:${QueueName}
                  - QueueName: !Select [4, !Split ["/", !Ref DeleteQueueUrl]]
  # Lambda used by every "Emit ..." / phase-event state and by EventRule;
  # has CRUD access to the jobs table.
  EmitEvent:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ../backend/lambdas/tasks/
      Handler: emit_event.handler
      Policies:
        - DynamoDBCrudPolicy:
            TableName: !Ref JobTableName
  # Catches executions of the main state machine that end as ABORTED,
  # TIMED_OUT or FAILED and turns them into an "Exception" event sent to
  # EmitEvent, using the execution name as the job id.
  EventRule:
    Type: AWS::Events::Rule
    Properties:
      Description: "Event rule for emitting unknown Step Functions failures"
      State: "ENABLED"
      EventPattern:
        source:
          - "aws.states"
        detail:
          status:
            - "ABORTED"
            - "TIMED_OUT"
            - "FAILED"
          stateMachineArn:
            - !Sub arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:stateMachine:${StateMachinePrefix}-StateMachine
      Targets:
        - Id: "TargetEmitEvent"
          Arn: !GetAtt EmitEvent.Arn
          InputTransformer:
            InputPathsMap:
              job_id: "$.detail.name"
              status: "$.detail.status"
            InputTemplate: '{"JobId": <job_id>, "EventName": "Exception", "EventData": {"Error": <status>, "Cause": "State Machine execution exited unexpectedly"}, "EmitterId": "CloudWatchEvents"}'
  # Resource-based permission that lets EventRule (and only that rule)
  # invoke the EmitEvent function.
  PermissionForEventsToInvokeLambda:
    Type: AWS::Lambda::Permission
    Properties:
      Action: "lambda:InvokeFunction"
      FunctionName: !Ref EmitEvent
      Principal: !Sub "events.${AWS::URLSuffix}"
      SourceArn: !GetAtt EventRule.Arn
  992. Outputs:
  993. AthenaExecutionRole:
  994. Value: !Ref ExecuteQueryRole
  995. AthenaExecutionRoleArn:
  996. Value: !GetAtt ExecuteQueryRole.Arn
  997. AthenaStateMachineArn:
  998. Value: !Ref AthenaStateMachine
  999. GenerateQueriesRole:
  1000. Value: !Ref GenerateQueriesRole
  1001. QueryQueueUrl:
  1002. Value: !Ref QueryQueue
  1003. StateMachineArn:
  1004. Value: !Ref StateMachine
  1005. StateMachineRoleArn:
  1006. Value: !GetAtt StatesExecutionRole.Arn