##// END OF EJS Templates
automation: schedule an EC2Launch run on next boot...
Gregory Szorc -
r43528:c09e8ac3 default
parent child Browse files
Show More
@@ -1,1239 +1,1262 b''
1 1 # aws.py - Automation code for Amazon Web Services
2 2 #
3 3 # Copyright 2019 Gregory Szorc <gregory.szorc@gmail.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 # no-check-code because Python 3 native.
9 9
10 10 import contextlib
11 11 import copy
12 12 import hashlib
13 13 import json
14 14 import os
15 15 import pathlib
16 16 import subprocess
17 17 import time
18 18
19 19 import boto3
20 20 import botocore.exceptions
21 21
22 22 from .linux import BOOTSTRAP_DEBIAN
23 23 from .ssh import (
24 24 exec_command as ssh_exec_command,
25 25 wait_for_ssh,
26 26 )
27 27 from .winrm import (
28 28 run_powershell,
29 29 wait_for_winrm,
30 30 )
31 31
32 32
# Directory four levels above this file (the chain of ``.parent`` lookups
# walks up from the absolute path of this module).
SOURCE_ROOT = pathlib.Path(
    os.path.abspath(__file__)
).parent.parent.parent.parent

# Location of ``contrib/install-windows-dependencies.ps1`` under SOURCE_ROOT.
INSTALL_WINDOWS_DEPENDENCIES = (
    SOURCE_ROOT / 'contrib' / 'install-windows-dependencies.ps1'
)


# Instance type name prefixes that are treated as having instance storage.
# ``temporary_linux_dev_instances()`` matches an instance type against these
# with ``str.startswith()`` before deciding to attach an extra EBS volume.
INSTANCE_TYPES_WITH_STORAGE = {
    'c5d',
    'd2',
    'h1',
    'i3',
    'm5ad',
    'm5d',
    'r5d',
    'r5ad',
    'x1',
    'z1d',
}


# AWS account IDs. The Debian and Ubuntu IDs are passed to ``find_image()``
# as the owner of the base AMIs used by ``ensure_linux_dev_ami()``.
# NOTE(review): AMAZON_ACCOUNT_ID is presumably the owner of the Windows base
# image; its use is not visible in this part of the file -- confirm.
AMAZON_ACCOUNT_ID = '801119661308'
DEBIAN_ACCOUNT_ID = '379101102735'
DEBIAN_ACCOUNT_ID_2 = '136693071363'
UBUNTU_ACCOUNT_ID = '099720109477'


# Name of a Windows AMI.
# NOTE(review): presumably the base image Windows AMIs are built from; the
# lookup is not visible in this part of the file -- confirm.
WINDOWS_BASE_IMAGE_NAME = 'Windows_Server-2019-English-Full-Base-2019.07.12'


# Un-prefixed names of the EC2 key pairs that ``ensure_key_pairs()`` creates
# when they are missing remotely.
KEY_PAIRS = {
    'automation',
}
68 68
69 69
# Security groups managed by ``ensure_security_groups()``, keyed by group
# name without the ``hg-`` prefix. When a group is created, ``description``
# is passed to ``create_security_group()`` and ``ingress`` is passed as
# ``IpPermissions`` to ``authorize_ingress()``.
SECURITY_GROUPS = {
    'linux-dev-1': {
        'description': 'Mercurial Linux instances that perform build/test automation',
        'ingress': [
            {
                'FromPort': 22,
                'ToPort': 22,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'SSH from entire Internet',
                    },
                ],
            },
        ],
    },
    'windows-dev-1': {
        'description': 'Mercurial Windows instances that perform build automation',
        'ingress': [
            {
                'FromPort': 22,
                'ToPort': 22,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'SSH from entire Internet',
                    },
                ],
            },
            {
                'FromPort': 3389,
                'ToPort': 3389,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'RDP from entire Internet',
                    },
                ],
            },
            {
                # Port range covers both 5985 and 5986.
                'FromPort': 5985,
                'ToPort': 5986,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'PowerShell Remoting (Windows Remote Management)',
                    },
                ],
            },
        ],
    },
}
126 126
127 127
# IAM roles managed by ``ensure_iam_state()``, keyed by role name without the
# ``hg-`` prefix. ``description`` is used when the role is created and each
# entry in ``policy_arns`` is attached to the newly created role.
IAM_ROLES = {
    'ephemeral-ec2-role-1': {
        'description': 'Mercurial temporary EC2 instances',
        'policy_arns': [
            'arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM',
        ],
    },
}


# Policy document passed as ``AssumeRolePolicyDocument`` when
# ``ensure_iam_state()`` creates a role. It allows the ``ec2.amazonaws.com``
# service principal to perform ``sts:AssumeRole``.
ASSUME_ROLE_POLICY_DOCUMENT = '''
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "ec2.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
'''.strip()


# IAM instance profiles managed by ``ensure_iam_state()``, keyed by profile
# name without the ``hg-`` prefix. ``roles`` lists the un-prefixed names of
# the roles the profile should contain.
IAM_INSTANCE_PROFILES = {
    'ephemeral-ec2-1': {'roles': ['ephemeral-ec2-role-1',],}
}
157 157
158 158
# User Data for Windows EC2 instance. Mainly used to set the password
# and configure WinRM.
# Inspired by the User Data script used by Packer
# (from https://www.packer.io/intro/getting-started/build-image.html).
#
# The template contains a single ``%s`` placeholder (the Administrator
# password); ``create_temp_windows_ec2_instances()`` fills it in with the
# ``%`` operator, so any literal percent sign added here must be doubled.
WINDOWS_USER_DATA = r'''
<powershell>

# TODO enable this once we figure out what is failing.
#$ErrorActionPreference = "stop"

# Set administrator password
net user Administrator "%s"
wmic useraccount where "name='Administrator'" set PasswordExpires=FALSE

# First, make sure WinRM can't be connected to
netsh advfirewall firewall set rule name="Windows Remote Management (HTTP-In)" new enable=yes action=block

# Delete any existing WinRM listeners
winrm delete winrm/config/listener?Address=*+Transport=HTTP 2>$Null
winrm delete winrm/config/listener?Address=*+Transport=HTTPS 2>$Null

# Create a new WinRM listener and configure
winrm create winrm/config/listener?Address=*+Transport=HTTP
winrm set winrm/config/winrs '@{MaxMemoryPerShellMB="0"}'
winrm set winrm/config '@{MaxTimeoutms="7200000"}'
winrm set winrm/config/service '@{AllowUnencrypted="true"}'
winrm set winrm/config/service '@{MaxConcurrentOperationsPerUser="12000"}'
winrm set winrm/config/service/auth '@{Basic="true"}'
winrm set winrm/config/client/auth '@{Basic="true"}'

# Configure UAC to allow privilege elevation in remote shells
$Key = 'HKLM:\SOFTWARE\Microsoft\Windows\CurrentVersion\Policies\System'
$Setting = 'LocalAccountTokenFilterPolicy'
Set-ItemProperty -Path $Key -Name $Setting -Value 1 -Force

# Configure and restart the WinRM Service; Enable the required firewall exception
Stop-Service -Name WinRM
Set-Service -Name WinRM -StartupType Automatic
netsh advfirewall firewall set rule name="Windows Remote Management (HTTP-In)" new action=allow localip=any remoteip=any
Start-Service -Name WinRM

# Disable firewall on private network interfaces so prompts don't appear.
Set-NetFirewallProfile -Name private -Enabled false
</powershell>
'''.lstrip()
204 204
205 205
# PowerShell run while bootstrapping a Windows instance. It installs the
# NuGet package provider and the OpenSSHUtils module, adds the OpenSSH
# server capability and enables the .NET Framework core feature.
#
# The progress message previously said "OpenSSL" although the command that
# follows installs OpenSSH.
# NOTE(review): if this script feeds an image fingerprint elsewhere in the
# file, changing the text will cause the Windows image to be rebuilt once.
WINDOWS_BOOTSTRAP_POWERSHELL = '''
Write-Output "installing PowerShell dependencies"
Install-PackageProvider -Name NuGet -MinimumVersion 2.8.5.201 -Force
Set-PSRepository -Name PSGallery -InstallationPolicy Trusted
Install-Module -Name OpenSSHUtils -RequiredVersion 0.0.2.0

Write-Output "installing OpenSSH server"
Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0
# Various tools will attempt to use older versions of .NET. So we enable
# the feature that provides them so it doesn't have to be auto-enabled
# later.
Write-Output "enabling .NET Framework feature"
Install-WindowsFeature -Name Net-Framework-Core
'''
220 220
221 221
class AWSConnection:
    """Bundle of boto3 handles and local state for one AWS region.

    Constructing an instance opens a boto3 session for ``region`` and creates
    EC2 and IAM clients/resources from it. Unless ``ensure_ec2_state`` is
    false, it also reconciles key pairs, security groups and IAM state, and
    stores the resulting security groups on ``security_groups``.
    """

    def __init__(self, automation, region: str, ensure_ec2_state: bool = True):
        self.automation = automation
        self.local_state_path = automation.state_path

        self.prefix = 'hg-'

        session = boto3.session.Session(region_name=region)
        self.session = session
        self.ec2client = session.client('ec2')
        self.ec2resource = session.resource('ec2')
        self.iamclient = session.client('iam')
        self.iamresource = session.resource('iam')
        self.security_groups = {}

        if not ensure_ec2_state:
            return

        ensure_key_pairs(automation.state_path, self.ec2resource)
        self.security_groups = ensure_security_groups(self.ec2resource)
        ensure_iam_state(self.iamclient, self.iamresource)

    def _key_file(self, filename):
        """Return the path of ``filename`` inside the local ``keys`` directory."""
        return self.local_state_path / 'keys' / filename

    def key_pair_path_private(self, name):
        """Path to the private key file of key pair ``name``."""
        return self._key_file('keypair-%s' % name)

    def key_pair_path_public(self, name):
        """Path to the public key file of key pair ``name``."""
        return self._key_file('keypair-%s.pub' % name)
249 249
250 250
def rsa_key_fingerprint(p: pathlib.Path):
    """Return the colon-separated SHA-1 fingerprint of an RSA private key.

    The key at ``p`` is converted to unencrypted PKCS#8 DER by running
    ``openssl pkcs8``; the SHA-1 hex digest of that output is returned as
    two-character groups joined by ``:``. A failing ``openssl`` invocation
    raises ``subprocess.CalledProcessError``.
    """

    # TODO use rsa package.
    command = [
        'openssl',
        'pkcs8',
        '-in',
        str(p),
        '-nocrypt',
        '-topk8',
        '-outform',
        'DER',
    ]
    der = subprocess.run(command, capture_output=True, check=True).stdout

    digest = hashlib.sha1(der).hexdigest()
    return ':'.join(digest[i : i + 2] for i in range(0, len(digest), 2))
272 272
273 273
def ensure_key_pairs(state_path: pathlib.Path, ec2resource, prefix='hg-'):
    """Reconcile EC2 key pairs with the key files stored under ``state_path``.

    Remote key pairs whose name starts with ``prefix`` are compared against
    the ``keypair-<name>`` / ``keypair-<name>.pub`` files in
    ``state_path / 'keys'``. A key present on only one side is removed from
    that side, and a key whose fingerprints differ is removed from both.
    Every name in ``KEY_PAIRS`` that is then missing remotely is created and
    its private and public halves are written locally.
    """
    remote_existing = {
        kpi.name[len(prefix) :]: kpi.key_fingerprint
        for kpi in ec2resource.key_pairs.all()
        if kpi.name.startswith(prefix)
    }

    # Validate that we have these keys locally.
    key_path = state_path / 'keys'
    key_path.mkdir(exist_ok=True, mode=0o700)

    def private_path(name):
        return key_path / ('keypair-%s' % name)

    def public_path(name):
        return key_path / ('keypair-%s.pub' % name)

    def remove_remote(name):
        print('deleting key pair %s' % name)
        ec2resource.KeyPair(name).delete()

    def remove_local(name):
        # Public half first, then the private half.
        for path in (public_path(name), private_path(name)):
            print('removing %s' % path)
            path.unlink()

    local_existing = {}

    for filename in sorted(os.listdir(key_path)):
        if not (filename.startswith('keypair-') and filename.endswith('.pub')):
            continue

        name = filename[len('keypair-') : -len('.pub')]

        pub_full = key_path / filename
        priv_full = private_path(name)

        with open(pub_full, 'r', encoding='ascii') as fh:
            data = fh.read()

        if data.startswith('ssh-rsa '):
            local_existing[name] = rsa_key_fingerprint(priv_full)
            continue

        print('unexpected format for key pair file: %s; removing' % pub_full)
        pub_full.unlink()
        priv_full.unlink()

    for name in sorted(set(remote_existing) | set(local_existing)):
        remote_name = '%s%s' % (prefix, name)

        if name not in local_existing:
            print('remote key %s does not exist locally' % name)
            remove_remote(remote_name)
            del remote_existing[name]

        elif name not in remote_existing:
            print('local key %s does not exist remotely' % name)
            remove_local(name)
            del local_existing[name]

        elif remote_existing[name] != local_existing[name]:
            print(
                'key fingerprint mismatch for %s; '
                'removing from local and remote' % name
            )
            remove_local(name)
            remove_remote(remote_name)
            del local_existing[name]
            del remote_existing[name]

    for name in sorted(KEY_PAIRS - set(remote_existing)):
        actual = '%s%s' % (prefix, name)
        print('creating key pair %s' % actual)

        priv_full = private_path(name)
        pub_full = public_path(name)

        kp = ec2resource.create_key_pair(KeyName=actual)

        with priv_full.open('w', encoding='ascii') as fh:
            fh.write(kp.key_material)
            fh.write('\n')

        priv_full.chmod(0o0600)

        # SSH public key can be extracted via `ssh-keygen`.
        with pub_full.open('w', encoding='ascii') as fh:
            subprocess.run(
                ['ssh-keygen', '-y', '-f', str(priv_full)],
                stdout=fh,
                check=True,
            )

        pub_full.chmod(0o0600)
371 371
372 372
def delete_instance_profile(profile):
    """Remove every role from an IAM instance profile, then delete it."""
    message = 'removing role %s from instance profile %s'

    for member in profile.roles:
        print(message % (member.name, profile.name))
        profile.remove_role(RoleName=member.name)

    print('deleting instance profile %s' % profile.name)
    profile.delete()
383 383
384 384
def ensure_iam_state(iamclient, iamresource, prefix='hg-'):
    """Ensure IAM state is in sync with our canonical definition.

    Instance profiles and roles whose name starts with ``prefix`` but that
    are not listed in ``IAM_INSTANCE_PROFILES`` / ``IAM_ROLES`` are deleted.
    Missing profiles and roles are then created (new roles get the policies
    in their ``policy_arns`` attached), and finally each profile's role
    membership is adjusted to match its ``roles`` list.
    """

    remote_profiles = {}

    for profile in iamresource.instance_profiles.all():
        if profile.name.startswith(prefix):
            remote_profiles[profile.name[len(prefix) :]] = profile

    for name in sorted(set(remote_profiles) - set(IAM_INSTANCE_PROFILES)):
        delete_instance_profile(remote_profiles[name])
        del remote_profiles[name]

    remote_roles = {}

    for role in iamresource.roles.all():
        if role.name.startswith(prefix):
            remote_roles[role.name[len(prefix) :]] = role

    for name in sorted(set(remote_roles) - set(IAM_ROLES)):
        role = remote_roles[name]

        # IAM rejects deletion of a role that still has managed policies
        # attached, so detach them first (same as ``remove_resources()``).
        for policy in role.attached_policies.all():
            print('detaching policy %s from %s' % (policy.arn, role.name))
            role.detach_policy(PolicyArn=policy.arn)

        print('removing role %s' % role.name)
        role.delete()
        del remote_roles[name]

    # We've purged remote state that doesn't belong. Create missing
    # instance profiles and roles.
    for name in sorted(set(IAM_INSTANCE_PROFILES) - set(remote_profiles)):
        actual = '%s%s' % (prefix, name)
        print('creating IAM instance profile %s' % actual)

        profile = iamresource.create_instance_profile(
            InstanceProfileName=actual
        )
        remote_profiles[name] = profile

        # Block until the new profile is visible before using it.
        waiter = iamclient.get_waiter('instance_profile_exists')
        waiter.wait(InstanceProfileName=actual)
        print('IAM instance profile %s is available' % actual)

    for name in sorted(set(IAM_ROLES) - set(remote_roles)):
        entry = IAM_ROLES[name]

        actual = '%s%s' % (prefix, name)
        print('creating IAM role %s' % actual)

        role = iamresource.create_role(
            RoleName=actual,
            Description=entry['description'],
            AssumeRolePolicyDocument=ASSUME_ROLE_POLICY_DOCUMENT,
        )

        # Block until the new role is visible before attaching policies.
        waiter = iamclient.get_waiter('role_exists')
        waiter.wait(RoleName=actual)
        print('IAM role %s is available' % actual)

        remote_roles[name] = role

        for arn in entry['policy_arns']:
            print('attaching policy %s to %s' % (arn, role.name))
            role.attach_policy(PolicyArn=arn)

    # Now reconcile state of profiles.
    for name, meta in sorted(IAM_INSTANCE_PROFILES.items()):
        profile = remote_profiles[name]
        wanted = {'%s%s' % (prefix, role) for role in meta['roles']}
        have = {role.name for role in profile.roles}

        for role in sorted(have - wanted):
            print('removing role %s from %s' % (role, profile.name))
            profile.remove_role(RoleName=role)

        for role in sorted(wanted - have):
            print('adding role %s to %s' % (role, profile.name))
            profile.add_role(RoleName=role)
461 461
462 462
def find_image(ec2resource, owner_id, name):
    """Return the first available machine image with this owner and name.

    Raises ``Exception`` when the query yields no image.
    """
    criteria = {
        'owner-id': owner_id,
        'state': 'available',
        'image-type': 'machine',
        'name': name,
    }
    filters = [{'Name': key, 'Values': [value]} for key, value in criteria.items()]

    match = next(iter(ec2resource.images.filter(Filters=filters)), None)
    if match is None:
        raise Exception('unable to find image for %s' % name)

    return match
479 479
480 480
def ensure_security_groups(ec2resource, prefix='hg-'):
    """Make the remote security groups match ``SECURITY_GROUPS``.

    Groups whose name starts with ``prefix`` (``hg-`` by default) but that
    are not defined in ``SECURITY_GROUPS`` are deleted. Defined groups that
    do not exist yet are created with their description and ingress rules;
    groups that already exist are reused as they are.

    Returns a dict mapping un-prefixed group name to its security group
    resource.
    """
    existing = {
        group.group_name[len(prefix) :]: group
        for group in ec2resource.security_groups.all()
        if group.group_name.startswith(prefix)
    }

    for name in sorted(set(existing) - set(SECURITY_GROUPS)):
        stale = existing[name]
        print('removing legacy security group: %s' % stale.group_name)
        stale.delete()

    security_groups = {}

    for name, spec in sorted(SECURITY_GROUPS.items()):
        if name not in existing:
            actual = '%s%s' % (prefix, name)
            print('adding security group %s' % actual)

            created = ec2resource.create_security_group(
                Description=spec['description'], GroupName=actual,
            )
            created.authorize_ingress(IpPermissions=spec['ingress'])

            security_groups[name] = created
        else:
            security_groups[name] = existing[name]

    return security_groups
519 519
520 520
def terminate_ec2_instances(ec2resource, prefix='hg-'):
    """Terminate instances whose ``Name`` tag starts with ``prefix``.

    Instances already in the ``terminated`` state are ignored. After all
    terminations have been requested, this waits for each one to finish.
    """
    pending = []

    for instance in ec2resource.instances.all():
        if instance.state['Name'] == 'terminated':
            continue

        # ``tags`` may be None for untagged instances.
        for tag in instance.tags or []:
            if tag['Key'] != 'Name' or not tag['Value'].startswith(prefix):
                continue

            print('terminating %s' % instance.id)
            instance.terminate()
            pending.append(instance)

    for instance in pending:
        instance.wait_until_terminated()
537 537
538 538
def remove_resources(c, prefix='hg-'):
    """Delete everything named with ``prefix`` in this connection's region.

    In order: instances are terminated, then our own AMIs, security groups,
    IAM instance profiles and IAM roles (after detaching their policies) are
    removed.
    """
    ec2resource = c.ec2resource
    iamresource = c.iamresource

    def ours(value):
        return value.startswith(prefix)

    terminate_ec2_instances(ec2resource, prefix=prefix)

    for image in ec2resource.images.filter(Owners=['self']):
        if ours(image.name):
            remove_ami(ec2resource, image)

    for group in ec2resource.security_groups.all():
        if ours(group.group_name):
            print('removing security group %s' % group.group_name)
            group.delete()

    for profile in iamresource.instance_profiles.all():
        if ours(profile.name):
            delete_instance_profile(profile)

    for role in iamresource.roles.all():
        if not ours(role.name):
            continue

        for policy in role.attached_policies.all():
            print('detaching policy %s from %s' % (policy.arn, role.name))
            role.detach_policy(PolicyArn=policy.arn)

        print('removing role %s' % role.name)
        role.delete()
567 567
568 568
def wait_for_ip_addresses(instances):
    """Block until every instance reports a public IP address.

    Each instance is polled by sleeping 2 seconds and reloading it until
    ``public_ip_address`` is set; the address is then printed.
    """
    for instance in instances:
        while not instance.public_ip_address:
            time.sleep(2)
            instance.reload()

        print(
            'public IP address for %s: %s'
            % (instance.id, instance.public_ip_address)
        )
583 583
584 584
def remove_ami(ec2resource, image):
    """Deregister an AMI, then delete the snapshots behind its EBS devices."""
    # Resolve the snapshots before the image is deregistered.
    snapshots = [
        ec2resource.Snapshot(mapping['Ebs']['SnapshotId'])
        for mapping in image.block_device_mappings
        if 'Ebs' in mapping
    ]

    print('deregistering %s' % image.id)
    image.deregister()

    for snap in snapshots:
        print('deleting snapshot %s' % snap.id)
        snap.delete()
599 599
600 600
def wait_for_ssm(ssmclient, instances):
    """Poll SSM until all of the given instances are reported.

    ``instances`` must be a sized collection of objects with an ``id``
    attribute. Every 2 seconds the instance information is queried for those
    IDs; the function returns once the number of entries returned equals the
    number of instances.
    """
    instance_filter = {
        'Key': 'InstanceIds',
        'Values': [i.id for i in instances],
    }
    wanted = len(instances)

    while True:
        res = ssmclient.describe_instance_information(Filters=[instance_filter])
        available = len(res['InstanceInformationList'])

        print('%d/%d instances available in SSM' % (available, wanted))

        if available == wanted:
            return

        time.sleep(2)
619 619
620 620
def run_ssm_command(ssmclient, instances, document_name, parameters):
    """Send an SSM document to instances and wait for it to succeed on each.

    The command is sent once to all instances with CloudWatch output
    enabled. Each instance's invocation is then polled until its status is
    ``Success``. A status other than ``Pending``, ``InProgress`` or
    ``Delayed`` raises ``Exception``. An ``InvocationDoesNotExist`` client
    error is treated as "not registered yet" and retried after 1 second.
    """

    sent = ssmclient.send_command(
        InstanceIds=[i.id for i in instances],
        DocumentName=document_name,
        Parameters=parameters,
        CloudWatchOutputConfig={'CloudWatchOutputEnabled': True},
    )
    command_id = sent['Command']['CommandId']

    still_running = ('Pending', 'InProgress', 'Delayed')

    for instance in instances:
        while True:
            try:
                invocation = ssmclient.get_command_invocation(
                    CommandId=command_id, InstanceId=instance.id,
                )
            except botocore.exceptions.ClientError as e:
                if e.response['Error']['Code'] != 'InvocationDoesNotExist':
                    raise

                print('could not find SSM command invocation; waiting')
                time.sleep(1)
                continue

            status = invocation['Status']

            if status == 'Success':
                break

            if status not in still_running:
                raise Exception(
                    'command failed on %s: %s' % (instance.id, status)
                )

            time.sleep(2)
655 655
656 656
@contextlib.contextmanager
def temporary_ec2_instances(ec2resource, config):
    """Run EC2 instances for the duration of a ``with`` block.

    ``config`` is forwarded as keyword arguments to
    ``ec2resource.create_instances()``. The context manager yields the
    created instances and terminates each of them when the block exits,
    whether or not it raised.

    The instances may not be ready for work when they are yielded: callers
    are responsible for waiting until an instance starts responding.
    """

    ids = None

    try:
        created = ec2resource.create_instances(**config)

        ids = [instance.id for instance in created]
        print('started instances: %s' % ' '.join(ids))

        yield created
    finally:
        # ``ids`` stays None when instance creation itself failed.
        if ids:
            print('terminating instances: %s' % ' '.join(ids))
            for instance in created:
                instance.terminate()
            print('terminated %d instances' % len(ids))
687 687
688 688
@contextlib.contextmanager
def create_temp_windows_ec2_instances(
    c: AWSConnection, config, bootstrap: bool = False
):
    """Create temporary Windows EC2 instances.

    This is a higher-level wrapper around ``temporary_ec2_instances()`` that
    waits for Windows Remote Management to become reachable on each
    instance. The emitted instances will have a ``winrm_client`` attribute
    holding the client returned by ``wait_for_winrm()`` for that instance.

    ``config`` is the instance configuration; it is deep-copied, so the
    caller's dict is not modified. It must not contain ``IamInstanceProfile``
    or ``UserData`` (``ValueError`` is raised if it does), because both are
    managed here.

    ``bootstrap`` controls whether ``WINDOWS_USER_DATA`` (which sets the
    Administrator password and configures WinRM) is passed as the instance's
    User Data. When it is false no User Data is supplied.
    NOTE(review): presumably the image was already configured by an earlier
    bootstrap run in that case -- confirm against callers.
    """
    if 'IamInstanceProfile' in config:
        raise ValueError('IamInstanceProfile cannot be provided in config')
    if 'UserData' in config:
        raise ValueError('UserData cannot be provided in config')

    password = c.automation.default_password()

    config = copy.deepcopy(config)
    config['IamInstanceProfile'] = {
        'Name': 'hg-ephemeral-ec2-1',
    }
    # Tag the instance so ``terminate_ec2_instances()`` can find it by its
    # ``hg-`` prefixed Name.
    config.setdefault('TagSpecifications', []).append(
        {
            'ResourceType': 'instance',
            'Tags': [{'Key': 'Name', 'Value': 'hg-temp-windows'}],
        }
    )

    if bootstrap:
        config['UserData'] = WINDOWS_USER_DATA % password

    with temporary_ec2_instances(c.ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        print('waiting for Windows Remote Management service...')

        for instance in instances:
            client = wait_for_winrm(
                instance.public_ip_address, 'Administrator', password
            )
            print('established WinRM connection to %s' % instance.id)
            instance.winrm_client = client

        yield instances
730 734
731 735
def resolve_fingerprint(fingerprint):
    """Return the SHA-256 hex digest of ``fingerprint`` serialized as JSON.

    Keys are sorted during serialization so that equal mappings hash the
    same regardless of insertion order.
    """
    canonical = json.dumps(fingerprint, sort_keys=True).encode('utf-8')
    return hashlib.sha256(canonical).hexdigest()
735 739
736 740
def find_and_reconcile_image(ec2resource, name, fingerprint):
    """Look for an AMI with the given name and fingerprint.

    Returns the image whose ``HGIMAGEFINGERPRINT`` tag equals
    ``fingerprint``, or None when there is no such image.

    Images with this name that have no tags at all, or whose fingerprint tag
    is missing or different, are removed.
    """
    # Walk every AMI with this name, deleting the invalid ones and
    # remembering a good one so it can be returned once the image state is
    # reconciled.
    candidates = ec2resource.images.filter(
        Filters=[{'Name': 'name', 'Values': [name]}]
    )

    existing_image = None

    for image in candidates:
        if image.tags is None:
            print(
                'image %s for %s lacks required tags; removing'
                % (image.id, image.name)
            )
            remove_ami(ec2resource, image)
            continue

        tags = {tag['Key']: tag['Value'] for tag in image.tags}

        if tags.get('HGIMAGEFINGERPRINT') == fingerprint:
            existing_image = image
            continue

        print(
            'image %s for %s has wrong fingerprint; removing'
            % (image.id, image.name)
        )
        remove_ami(ec2resource, image)

    return existing_image
775 779
776 780
def create_ami_from_instance(
    ec2client, instance, name, description, fingerprint
):
    """Stop an instance and create a tagged AMI from it.

    The instance is stopped and waited on, an image with the given name and
    description is created from it, the image is tagged with
    ``HGIMAGEFINGERPRINT`` set to ``fingerprint``, and the call blocks until
    the image is available.

    Returns the image object produced by ``instance.create_image()``.
    """
    instance.stop()

    stopped = ec2client.get_waiter('instance_stopped')
    stopped.wait(InstanceIds=[instance.id], WaiterConfig={'Delay': 5})
    print('%s is stopped' % instance.id)

    image = instance.create_image(Name=name, Description=description)

    fingerprint_tag = {'Key': 'HGIMAGEFINGERPRINT', 'Value': fingerprint}
    image.create_tags(Tags=[fingerprint_tag])

    print('waiting for image %s' % image.id)

    available = ec2client.get_waiter('image_available')
    available.wait(ImageIds=[image.id])

    print('image %s available as %s' % (image.id, image.name))

    return image
804 808
805 809
def ensure_linux_dev_ami(c: AWSConnection, distro='debian10', prefix='hg-'):
    """Return an up-to-date Linux development AMI, building it if needed.

    The base image for ``distro`` is looked up, and a fingerprint is computed
    from the instance configuration, the bootstrap script and the two
    requirements files. If an AMI with the expected name and that fingerprint
    already exists it is returned. Otherwise a temporary instance is started,
    the bootstrap script is uploaded and executed over SSH, and a new AMI is
    created from the instance.

    Raises ``ValueError`` for an unsupported ``distro`` and ``Exception`` if
    the bootstrap script exits with a non-zero status.
    """
    ec2client = c.ec2client
    ec2resource = c.ec2resource

    name = '%s%s-%s' % (prefix, 'linux-dev', distro)

    # distro -> (AMI owner account, base AMI name, SSH login user)
    base_images = {
        'debian9': (
            DEBIAN_ACCOUNT_ID,
            'debian-stretch-hvm-x86_64-gp2-2019-09-08-17994',
            'admin',
        ),
        'debian10': (
            DEBIAN_ACCOUNT_ID_2,
            'debian-10-amd64-20190909-10',
            'admin',
        ),
        'ubuntu18.04': (
            UBUNTU_ACCOUNT_ID,
            'ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190918',
            'ubuntu',
        ),
        'ubuntu19.04': (
            UBUNTU_ACCOUNT_ID,
            'ubuntu/images/hvm-ssd/ubuntu-disco-19.04-amd64-server-20190918',
            'ubuntu',
        ),
    }

    if distro not in base_images:
        raise ValueError('unsupported Linux distro: %s' % distro)

    owner_id, base_name, ssh_username = base_images[distro]
    image = find_image(ec2resource, owner_id, base_name)

    root_volume = {
        'DeviceName': image.block_device_mappings[0]['DeviceName'],
        'Ebs': {
            'DeleteOnTermination': True,
            'VolumeSize': 10,
            'VolumeType': 'gp2',
        },
    }

    config = {
        'BlockDeviceMappings': [root_volume],
        'EbsOptimized': True,
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        # 8 VCPUs for compiling Python.
        'InstanceType': 't3.2xlarge',
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['linux-dev-1'].id],
    }

    # The requirements files live one directory above this package.
    automation_dir = pathlib.Path(__file__).parent.parent

    with (automation_dir / 'linux-requirements-py2.txt').open(
        'r', encoding='utf-8'
    ) as fh:
        requirements2 = fh.read()
    with (automation_dir / 'linux-requirements-py3.txt').open(
        'r', encoding='utf-8'
    ) as fh:
        requirements3 = fh.read()

    # Compute a deterministic fingerprint to determine whether image needs to
    # be regenerated.
    fingerprint = resolve_fingerprint(
        {
            'instance_config': config,
            'bootstrap_script': BOOTSTRAP_DEBIAN,
            'requirements_py2': requirements2,
            'requirements_py3': requirements3,
        }
    )

    existing_image = find_and_reconcile_image(ec2resource, name, fingerprint)

    if existing_image:
        return existing_image

    print('no suitable %s image found; creating one...' % name)

    with temporary_ec2_instances(ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        instance = instances[0]

        client = wait_for_ssh(
            instance.public_ip_address,
            22,
            username=ssh_username,
            key_filename=str(c.key_pair_path_private('automation')),
        )

        home = '/home/%s' % ssh_username

        # Files placed in the login user's home directory, in upload order.
        uploads = (
            ('bootstrap', BOOTSTRAP_DEBIAN),
            ('requirements-py2.txt', requirements2),
            ('requirements-py3.txt', requirements3),
        )

        with client:
            print('connecting to SSH server')
            sftp = client.open_sftp()

            print('uploading bootstrap files')
            for filename, content in uploads:
                with sftp.open('%s/%s' % (home, filename), 'wb') as fh:
                    fh.write(content)
                    fh.chmod(0o0700)

            print('executing bootstrap')
            chan, stdin, stdout = ssh_exec_command(
                client, '%s/bootstrap' % home
            )
            stdin.close()

            # Relay the bootstrap output as it arrives.
            for line in stdout:
                print(line, end='')

            exit_status = chan.recv_exit_status()
            if exit_status:
                raise Exception(
                    'non-0 exit from bootstrap: %d' % exit_status
                )

            print(
                'bootstrap completed; stopping %s to create %s'
                % (instance.id, name)
            )

        return create_ami_from_instance(
            ec2client,
            instance,
            name,
            'Mercurial Linux development environment',
            fingerprint,
        )
952 956
953 957
@contextlib.contextmanager
def temporary_linux_dev_instances(
    c: AWSConnection,
    image,
    instance_type,
    prefix='hg-',
    ensure_extra_volume=False,
):
    """Create temporary Linux development EC2 instances.

    Context manager resolves to a list of ``ec2.Instance`` that were created
    and are running.

    ``ensure_extra_volume`` can be set to ``True`` to require that instances
    have a 2nd storage volume available other than the primary AMI volume.
    For instance types with instance storage, this does nothing special.
    But for instance types without instance storage, an additional EBS volume
    will be added to the instance.

    Instances have an ``ssh_client`` attribute containing a paramiko SSHClient
    instance bound to the instance.

    Instances have an ``ssh_private_key_path`` attribute containing the
    str path to the SSH private key to connect to the instance.

    Raises ``ValueError`` if an extra volume is required but the primary
    device name of ``image`` is not one this function knows how to pair
    with a second device name.
    """

    block_device_mappings = [
        {
            'DeviceName': image.block_device_mappings[0]['DeviceName'],
            'Ebs': {
                'DeleteOnTermination': True,
                'VolumeSize': 12,
                'VolumeType': 'gp2',
            },
        }
    ]

    # INSTANCE_TYPES_WITH_STORAGE is not an exhaustive list of instance types
    # having instance storage. Any instance type whose name does not start
    # with one of its entries is given an additional EBS volume when
    # ``ensure_extra_volume`` is set.
    if ensure_extra_volume and not instance_type.startswith(
        tuple(INSTANCE_TYPES_WITH_STORAGE)
    ):
        main_device = block_device_mappings[0]['DeviceName']

        if main_device == 'xvda':
            second_device = 'xvdb'
        elif main_device == '/dev/sda1':
            second_device = '/dev/sdb'
        else:
            raise ValueError(
                'unhandled primary EBS device name: %s' % main_device
            )

        block_device_mappings.append(
            {
                'DeviceName': second_device,
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': 8,
                    'VolumeType': 'gp2',
                },
            }
        )

    config = {
        'BlockDeviceMappings': block_device_mappings,
        'EbsOptimized': True,
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'terminate',
        'InstanceType': instance_type,
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['linux-dev-1'].id],
    }

    with temporary_ec2_instances(c.ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        ssh_private_key_path = str(c.key_pair_path_private('automation'))

        # Instances that already have an SSH client attached. Only these are
        # closed on exit, so a failure while connecting to a later instance
        # does not leak the connections opened for earlier ones.
        connected = []

        try:
            for instance in instances:
                client = wait_for_ssh(
                    instance.public_ip_address,
                    22,
                    username='hg',
                    key_filename=ssh_private_key_path,
                )

                instance.ssh_client = client
                instance.ssh_private_key_path = ssh_private_key_path
                connected.append(instance)

            yield instances
        finally:
            for instance in connected:
                instance.ssh_client.close()
1051 1055
1052 1056
def ensure_windows_dev_ami(
    c: AWSConnection, prefix='hg-', base_image_name=WINDOWS_BASE_IMAGE_NAME
):
    """Return an up-to-date Windows development AMI, building it if needed.

    A fingerprint is computed from the instance configuration, the user
    data, the bootstrap scripts and the base image name. If an image with
    the desired name and a matching fingerprint is found by
    ``find_and_reconcile_image()``, it is returned as-is. Otherwise a
    temporary EC2 instance is started from the base image, bootstrapped,
    and turned into a new AMI.

    Obsolete AMIs will be deleted so there is only a single AMI having the
    desired name.

    Returns an ``ec2.Image`` of either an existing AMI or a newly-built
    one.
    """
    ec2client = c.ec2client
    ec2resource = c.ec2resource
    ssm = c.session.client('ssm')

    name = '%swindows-dev' % prefix

    base_image = find_image(ec2resource, AMAZON_ACCOUNT_ID, base_image_name)

    config = {
        'BlockDeviceMappings': [
            {
                'DeviceName': '/dev/sda1',
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': 32,
                    'VolumeType': 'gp2',
                },
            }
        ],
        'ImageId': base_image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        'InstanceType': 't3.medium',
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['windows-dev-1'].id],
    }

    # The bootstrap command list is part of the image fingerprint, so every
    # string below must stay exactly as-is unless a rebuild is intended.
    sshd_commands = [
        # Need to start the service so sshd_config is generated.
        'Start-Service sshd',
        'Write-Output "modifying sshd_config"',
        r'$content = Get-Content C:\ProgramData\ssh\sshd_config',
        '$content = $content -replace "Match Group administrators","" -replace "AuthorizedKeysFile __PROGRAMDATA__/ssh/administrators_authorized_keys",""',
        r'$content | Set-Content C:\ProgramData\ssh\sshd_config',
        'Import-Module OpenSSHUtils',
        r'Repair-SshdConfigPermission C:\ProgramData\ssh\sshd_config -Confirm:$false',
        'Restart-Service sshd',
        'Write-Output "installing OpenSSL client"',
        'Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0',
        'Set-Service -Name sshd -StartupType "Automatic"',
        'Write-Output "OpenSSH server running"',
    ]

    # Each line of the dependency installation script becomes one command.
    with INSTALL_WINDOWS_DEPENDENCIES.open('r', encoding='utf-8') as fh:
        dependency_commands = [line.rstrip() for line in fh]

    # Schedule run of EC2Launch on next boot. This ensures that UserData
    # is executed.
    # We disable setComputerName because it forces a reboot.
    # We set an explicit admin password because this causes UserData to run
    # as Administrator instead of System.
    launch_config_template = (
        r'''Set-Content -Path C:\ProgramData\Amazon\EC2-Windows\Launch\Config\LaunchConfig.json '''
        r'''-Value '{"setComputerName": false, "setWallpaper": true, "addDnsSuffixList": true, '''
        r'''"extendBootVolumeSize": true, "handleUserData": true, '''
        r'''"adminPasswordType": "Specify", "adminPassword": "%s"}' '''
    )
    # NOTE(review): the dash before "Schedule" is U+2013 (en dash), not an
    # ASCII '-'. It is kept byte-for-byte here; confirm PowerShell treats it
    # as a parameter prefix before normalizing it.
    ec2launch_commands = [
        launch_config_template % c.automation.default_password(),
        r'C:\ProgramData\Amazon\EC2-Windows\Launch\Scripts\InitializeInstance.ps1 '
        r'–Schedule',
    ]

    # Disable Windows Defender when bootstrapping because it just slows
    # things down. Real-time monitoring is turned back on as the last step.
    commands = (
        ['Set-MpPreference -DisableRealtimeMonitoring $true']
        + sshd_commands
        + dependency_commands
        + ec2launch_commands
        + ['Set-MpPreference -DisableRealtimeMonitoring $false']
    )

    # Compute a deterministic fingerprint to determine whether image needs
    # to be regenerated.
    fingerprint = resolve_fingerprint(
        {
            'instance_config': config,
            'user_data': WINDOWS_USER_DATA,
            'initial_bootstrap': WINDOWS_BOOTSTRAP_POWERSHELL,
            'bootstrap_commands': commands,
            'base_image_name': base_image_name,
        }
    )

    existing_image = find_and_reconcile_image(ec2resource, name, fingerprint)
    if existing_image:
        return existing_image

    print('no suitable Windows development image found; creating one...')

    with create_temp_windows_ec2_instances(
        c, config, bootstrap=True
    ) as instances:
        assert len(instances) == 1
        instance = instances[0]

        wait_for_ssm(ssm, [instance])

        # On first boot, install various Windows updates.
        # We would ideally use PowerShell Remoting for this. However, there are
        # trust issues that make it difficult to invoke Windows Update
        # remotely. So we use SSM, which has a mechanism for running Windows
        # Update.
        print('installing Windows features...')
        ssm_parameters = {
            'commands': WINDOWS_BOOTSTRAP_POWERSHELL.split('\n'),
        }
        run_ssm_command(
            ssm, [instance], 'AWS-RunPowerShellScript', ssm_parameters
        )

        # Reboot so all updates are fully applied.
        #
        # We don't use instance.reboot() here because it is asynchronous and
        # we don't know when exactly the instance has rebooted. It could take
        # a while to stop and we may start trying to interact with the instance
        # before it has rebooted.
        print('rebooting instance %s' % instance.id)
        instance.stop()
        stopped_waiter = ec2client.get_waiter('instance_stopped')
        stopped_waiter.wait(
            InstanceIds=[instance.id], WaiterConfig={'Delay': 5}
        )

        instance.start()
        wait_for_ip_addresses([instance])

        # There is a race condition here between the User Data PS script running
        # and us connecting to WinRM. This can manifest as
        # "AuthorizationManager check failed" failures during run_powershell().
        # TODO figure out a workaround.

        print('waiting for Windows Remote Management to come back...')
        instance.winrm_client = wait_for_winrm(
            instance.public_ip_address,
            'Administrator',
            c.automation.default_password(),
        )
        print('established WinRM connection to %s' % instance.id)

        print('bootstrapping instance...')
        bootstrap_script = '\n'.join(commands)
        run_powershell(instance.winrm_client, bootstrap_script)

        print('bootstrap completed; stopping %s to create image' % instance.id)
        return create_ami_from_instance(
            ec2client,
            instance,
            name,
            'Mercurial Windows development environment',
            fingerprint,
        )
1197 1220
1198 1221
@contextlib.contextmanager
def temporary_windows_dev_instances(
    c: AWSConnection,
    image,
    instance_type,
    prefix='hg-',
    disable_antivirus=False,
):
    """Start temporary Windows development EC2 instances from ``image``.

    The context manager yields the list of ``EC2.Instance`` produced by
    ``create_temp_windows_ec2_instances()`` for a single-instance launch
    configuration of type ``instance_type``.

    When ``disable_antivirus`` is true, Windows Defender real-time
    monitoring is turned off on every instance, through the instance's
    ``winrm_client``, before the instances are handed to the caller.
    """
    root_volume = {
        'DeviceName': '/dev/sda1',
        'Ebs': {
            'DeleteOnTermination': True,
            'VolumeSize': 32,
            'VolumeType': 'gp2',
        },
    }

    launch_config = {
        'BlockDeviceMappings': [root_volume],
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        'InstanceType': instance_type,
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['windows-dev-1'].id],
    }

    with create_temp_windows_ec2_instances(c, launch_config) as instances:
        # Nothing to do per instance unless antivirus must be switched off.
        targets = instances if disable_antivirus else []
        for instance in targets:
            run_powershell(
                instance.winrm_client,
                'Set-MpPreference -DisableRealtimeMonitoring $true',
            )

        yield instances
General Comments 0
You need to be logged in to leave comments. Login now