##// END OF EJS Templates
automation: use latest AMIs...
Gregory Szorc -
r43287:6952d42f default
parent child Browse files
Show More
@@ -1,1202 +1,1202 b''
1 1 # aws.py - Automation code for Amazon Web Services
2 2 #
3 3 # Copyright 2019 Gregory Szorc <gregory.szorc@gmail.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 # no-check-code because Python 3 native.
9 9
10 10 import contextlib
11 11 import copy
12 12 import hashlib
13 13 import json
14 14 import os
15 15 import pathlib
16 16 import subprocess
17 17 import time
18 18
19 19 import boto3
20 20 import botocore.exceptions
21 21
22 22 from .linux import (
23 23 BOOTSTRAP_DEBIAN,
24 24 )
25 25 from .ssh import (
26 26 exec_command as ssh_exec_command,
27 27 wait_for_ssh,
28 28 )
29 29 from .winrm import (
30 30 run_powershell,
31 31 wait_for_winrm,
32 32 )
33 33
34 34
# Directory four levels above this file; used as the root for locating
# other files in the source checkout.
SOURCE_ROOT = pathlib.Path(os.path.abspath(__file__)).parent.parent.parent.parent

# Path to the PowerShell script that installs Windows dependencies.
INSTALL_WINDOWS_DEPENDENCIES = (SOURCE_ROOT / 'contrib' /
                                'install-windows-dependencies.ps1')
39 39
40 40
# Instance type prefixes considered to come with instance storage.
# Instance types are matched against these with ``str.startswith()``.
INSTANCE_TYPES_WITH_STORAGE = {
    'c5d',
    'd2',
    'h1',
    'i3',
    'm5ad',
    'm5d',
    'r5d',
    'r5ad',
    'x1',
    'z1d',
}
53 53
54 54
# AWS account IDs. The Debian and Ubuntu IDs are passed as the ``owner-id``
# when looking up base AMIs.
# NOTE(review): AMAZON_ACCOUNT_ID is presumably the owner of the Windows
# base image -- its use is outside this view; confirm.
AMAZON_ACCOUNT_ID = '801119661308'
DEBIAN_ACCOUNT_ID = '379101102735'
UBUNTU_ACCOUNT_ID = '099720109477'


# Default base image name for ``ensure_windows_dev_ami()``.
WINDOWS_BASE_IMAGE_NAME = 'Windows_Server-2019-English-Full-Base-2019.07.12'


# Names (without the resource prefix) of the EC2 key pairs that
# ``ensure_key_pairs()`` creates when they are missing remotely.
KEY_PAIRS = {
    'automation',
}
66 66
67 67
# Canonical definition of our EC2 security groups, keyed by group name
# without the resource prefix. ``description`` is used as the group's
# description and ``ingress`` is passed verbatim as ``IpPermissions``
# when the group is created by ``ensure_security_groups()``.
SECURITY_GROUPS = {
    'linux-dev-1': {
        'description': 'Mercurial Linux instances that perform build/test automation',
        'ingress': [
            {
                'FromPort': 22,
                'ToPort': 22,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'SSH from entire Internet',
                    },
                ],
            },
        ],
    },
    'windows-dev-1': {
        'description': 'Mercurial Windows instances that perform build automation',
        'ingress': [
            {
                'FromPort': 22,
                'ToPort': 22,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'SSH from entire Internet',
                    },
                ],
            },
            {
                'FromPort': 3389,
                'ToPort': 3389,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'RDP from entire Internet',
                    },
                ],

            },
            {
                # Port range covers both 5985 and 5986.
                'FromPort': 5985,
                'ToPort': 5986,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'PowerShell Remoting (Windows Remote Management)',
                    },
                ],
            }
        ],
    },
}
125 125
126 126
# Canonical definition of our IAM roles, keyed by role name without the
# resource prefix. ``policy_arns`` are attached to the role when
# ``ensure_iam_state()`` creates it.
IAM_ROLES = {
    'ephemeral-ec2-role-1': {
        'description': 'Mercurial temporary EC2 instances',
        'policy_arns': [
            'arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM',
        ],
    },
}


# JSON policy document supplied as ``AssumeRolePolicyDocument`` when roles
# are created. It allows the ``ec2.amazonaws.com`` service principal to
# perform ``sts:AssumeRole``.
ASSUME_ROLE_POLICY_DOCUMENT = '''
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
'''.strip()


# Canonical definition of our IAM instance profiles, keyed by profile name
# without the resource prefix. ``roles`` lists the (unprefixed) names of
# the roles each profile should contain.
IAM_INSTANCE_PROFILES = {
    'ephemeral-ec2-1': {
        'roles': [
            'ephemeral-ec2-role-1',
        ],
    }
}
160 160
161 161
# User Data for Windows EC2 instance. Mainly used to set the password
# and configure WinRM.
# Inspired by the User Data script used by Packer
# (from https://www.packer.io/intro/getting-started/build-image.html).
#
# This is a ``%``-format template: the single ``%s`` placeholder is
# replaced with the Administrator password before the text is passed to
# EC2 as ``UserData``.
WINDOWS_USER_DATA = r'''
<powershell>

# TODO enable this once we figure out what is failing.
#$ErrorActionPreference = "stop"

# Set administrator password
net user Administrator "%s"
wmic useraccount where "name='Administrator'" set PasswordExpires=FALSE

# First, make sure WinRM can't be connected to
netsh advfirewall firewall set rule name="Windows Remote Management (HTTP-In)" new enable=yes action=block

# Delete any existing WinRM listeners
winrm delete winrm/config/listener?Address=*+Transport=HTTP 2>$Null
winrm delete winrm/config/listener?Address=*+Transport=HTTPS 2>$Null

# Create a new WinRM listener and configure
winrm create winrm/config/listener?Address=*+Transport=HTTP
winrm set winrm/config/winrs '@{MaxMemoryPerShellMB="0"}'
winrm set winrm/config '@{MaxTimeoutms="7200000"}'
winrm set winrm/config/service '@{AllowUnencrypted="true"}'
winrm set winrm/config/service '@{MaxConcurrentOperationsPerUser="12000"}'
winrm set winrm/config/service/auth '@{Basic="true"}'
winrm set winrm/config/client/auth '@{Basic="true"}'

# Configure UAC to allow privilege elevation in remote shells
$Key = 'HKLM:\SOFTWARE\Microsoft\Windows\CurrentVersion\Policies\System'
$Setting = 'LocalAccountTokenFilterPolicy'
Set-ItemProperty -Path $Key -Name $Setting -Value 1 -Force

# Configure and restart the WinRM Service; Enable the required firewall exception
Stop-Service -Name WinRM
Set-Service -Name WinRM -StartupType Automatic
netsh advfirewall firewall set rule name="Windows Remote Management (HTTP-In)" new action=allow localip=any remoteip=any
Start-Service -Name WinRM

# Disable firewall on private network interfaces so prompts don't appear.
Set-NetFirewallProfile -Name private -Enabled false
</powershell>
'''.lstrip()
207 207
208 208
# PowerShell snippet that installs the NuGet package provider, the
# OpenSSHUtils module, the OpenSSH server capability and the .NET
# Framework core feature.
# NOTE(review): presumably executed while bootstrapping the Windows
# development AMI -- the consumer is outside this view; confirm.
# NOTE(review): the progress message says "OpenSSL server" although the
# capability being installed is OpenSSH.Server. The text is left as-is
# here because it is part of the runtime string.
WINDOWS_BOOTSTRAP_POWERSHELL = '''
Write-Output "installing PowerShell dependencies"
Install-PackageProvider -Name NuGet -MinimumVersion 2.8.5.201 -Force
Set-PSRepository -Name PSGallery -InstallationPolicy Trusted
Install-Module -Name OpenSSHUtils -RequiredVersion 0.0.2.0

Write-Output "installing OpenSSL server"
Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0
# Various tools will attempt to use older versions of .NET. So we enable
# the feature that provides them so it doesn't have to be auto-enabled
# later.
Write-Output "enabling .NET Framework feature"
Install-WindowsFeature -Name Net-Framework-Core
'''
223 223
224 224
class AWSConnection:
    """Bundle of boto3 handles and local state for one AWS region.

    Instances expose the boto3 session, EC2/IAM clients and resources, the
    local state directory of the owning automation object and the security
    groups resolved at construction time.
    """

    def __init__(self, automation, region: str, ensure_ec2_state: bool = True):
        """Open a session in ``region``.

        When ``ensure_ec2_state`` is true, key pairs, security groups and
        IAM state are synchronized with their canonical definitions as part
        of construction. Otherwise ``security_groups`` is left empty.
        """
        self.automation = automation
        self.local_state_path = automation.state_path

        self.prefix = 'hg-'

        session = boto3.session.Session(region_name=region)
        self.session = session
        self.ec2client = session.client('ec2')
        self.ec2resource = session.resource('ec2')
        self.iamclient = session.client('iam')
        self.iamresource = session.resource('iam')
        self.security_groups = {}

        if not ensure_ec2_state:
            return

        ensure_key_pairs(automation.state_path, self.ec2resource)
        self.security_groups = ensure_security_groups(self.ec2resource)
        ensure_iam_state(self.iamclient, self.iamresource)

    def key_pair_path_private(self, name):
        """Return the path of the private key file for key pair ``name``."""
        return self.local_state_path.joinpath('keys', 'keypair-%s' % name)

    def key_pair_path_public(self, name):
        """Return the path of the public key file for key pair ``name``."""
        return self.local_state_path.joinpath('keys', 'keypair-%s.pub' % name)
252 252
253 253
def rsa_key_fingerprint(p: pathlib.Path):
    """Return the colon-separated SHA-1 fingerprint of an RSA private key.

    The key at ``p`` is converted to unencrypted PKCS#8 DER by invoking the
    ``openssl`` executable; the SHA-1 hex digest of that output is returned
    as pairs of hex digits joined by ``:``.

    Raises ``subprocess.CalledProcessError`` if ``openssl`` exits non-zero.
    """

    # TODO use rsa package.
    command = [
        'openssl', 'pkcs8', '-in', str(p), '-nocrypt', '-topk8',
        '-outform', 'DER',
    ]
    proc = subprocess.run(command, capture_output=True, check=True)

    digest = hashlib.sha1(proc.stdout).hexdigest()
    octets = (digest[i:i + 2] for i in range(0, len(digest), 2))
    return ':'.join(octets)
266 266
267 267
def ensure_key_pairs(state_path: pathlib.Path, ec2resource, prefix='hg-'):
    """Synchronize EC2 key pairs with the key files stored locally.

    Remote key pairs whose name starts with ``prefix`` are compared, by
    fingerprint, against the ``keypair-<name>`` / ``keypair-<name>.pub``
    files under ``state_path / 'keys'``:

    * a remote key without a local counterpart is deleted remotely;
    * a local key without a remote counterpart is deleted locally;
    * a pair whose fingerprints differ is deleted on both sides.

    Afterwards every name in ``KEY_PAIRS`` that is missing remotely is
    created in EC2 and its private and public keys are written locally.

    A local public key file that is malformed, or whose private key file is
    missing, is treated as an invalid local key: the leftover files are
    removed and the pair is handled as if it did not exist locally.
    """
    remote_existing = {}

    for kpi in ec2resource.key_pairs.all():
        if kpi.name.startswith(prefix):
            remote_existing[kpi.name[len(prefix):]] = kpi.key_fingerprint

    # Validate that we have these keys locally.
    key_path = state_path / 'keys'
    key_path.mkdir(exist_ok=True, mode=0o700)

    def unlink_if_present(path):
        # Half of a key pair may already be gone; that must not abort the
        # reconciliation.
        try:
            path.unlink()
        except FileNotFoundError:
            pass

    def remove_remote(name):
        print('deleting key pair %s' % name)
        key = ec2resource.KeyPair(name)
        key.delete()

    def remove_local(name):
        pub_full = key_path / ('keypair-%s.pub' % name)
        priv_full = key_path / ('keypair-%s' % name)

        print('removing %s' % pub_full)
        unlink_if_present(pub_full)
        print('removing %s' % priv_full)
        unlink_if_present(priv_full)

    local_existing = {}

    for f in sorted(os.listdir(key_path)):
        if not f.startswith('keypair-') or not f.endswith('.pub'):
            continue

        name = f[len('keypair-'):-len('.pub')]

        pub_full = key_path / f
        priv_full = key_path / ('keypair-%s' % name)

        with open(pub_full, 'r', encoding='ascii') as fh:
            data = fh.read()

        if not data.startswith('ssh-rsa '):
            print('unexpected format for key pair file: %s; removing' %
                  pub_full)
            unlink_if_present(pub_full)
            unlink_if_present(priv_full)
            continue

        if not priv_full.exists():
            # Without the private key no fingerprint can be computed and
            # the pair is unusable; drop the orphaned public key.
            print('private key missing for key pair file: %s; removing' %
                  pub_full)
            unlink_if_present(pub_full)
            continue

        local_existing[name] = rsa_key_fingerprint(priv_full)

    for name in sorted(set(remote_existing) | set(local_existing)):
        if name not in local_existing:
            actual = '%s%s' % (prefix, name)
            print('remote key %s does not exist locally' % name)
            remove_remote(actual)
            del remote_existing[name]

        elif name not in remote_existing:
            print('local key %s does not exist remotely' % name)
            remove_local(name)
            del local_existing[name]

        elif remote_existing[name] != local_existing[name]:
            print('key fingerprint mismatch for %s; '
                  'removing from local and remote' % name)
            remove_local(name)
            remove_remote('%s%s' % (prefix, name))
            del local_existing[name]
            del remote_existing[name]

    missing = KEY_PAIRS - set(remote_existing)

    for name in sorted(missing):
        actual = '%s%s' % (prefix, name)
        print('creating key pair %s' % actual)

        priv_full = key_path / ('keypair-%s' % name)
        pub_full = key_path / ('keypair-%s.pub' % name)

        kp = ec2resource.create_key_pair(KeyName=actual)

        # Create the file with restrictive permissions from the start so
        # the private key is never readable by others, not even briefly.
        fd = os.open(str(priv_full),
                     os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'w', encoding='ascii') as fh:
            fh.write(kp.key_material)
            fh.write('\n')

        # The mode given to os.open() only applies to newly created files;
        # enforce it when a stale file was overwritten.
        priv_full.chmod(0o0600)

        # SSH public key can be extracted via `ssh-keygen`.
        with pub_full.open('w', encoding='ascii') as fh:
            subprocess.run(
                ['ssh-keygen', '-y', '-f', str(priv_full)],
                stdout=fh,
                check=True)

        pub_full.chmod(0o0600)
361 361
362 362
def delete_instance_profile(profile):
    """Remove every role from an IAM instance profile, then delete it."""
    for attached in profile.roles:
        message = 'removing role %s from instance profile %s' % (
            attached.name, profile.name)
        print(message)
        profile.remove_role(RoleName=attached.name)

    print('deleting instance profile %s' % profile.name)
    profile.delete()
371 371
372 372
def ensure_iam_state(iamclient, iamresource, prefix='hg-'):
    """Ensure IAM state is in sync with our canonical definition.

    Only instance profiles and roles whose name starts with ``prefix`` are
    considered. Those not named in ``IAM_INSTANCE_PROFILES`` / ``IAM_ROLES``
    are deleted, missing ones are created, and finally the set of roles in
    each instance profile is made to match ``IAM_INSTANCE_PROFILES``.
    """

    remote_profiles = {}

    for profile in iamresource.instance_profiles.all():
        if profile.name.startswith(prefix):
            remote_profiles[profile.name[len(prefix):]] = profile

    for name in sorted(set(remote_profiles) - set(IAM_INSTANCE_PROFILES)):
        delete_instance_profile(remote_profiles[name])
        del remote_profiles[name]

    remote_roles = {}

    for role in iamresource.roles.all():
        if role.name.startswith(prefix):
            remote_roles[role.name[len(prefix):]] = role

    for name in sorted(set(remote_roles) - set(IAM_ROLES)):
        role = remote_roles[name]

        # A role cannot be deleted while managed policies are still
        # attached to it, so detach them first.
        for policy in role.attached_policies.all():
            print('detaching policy %s from %s' % (policy.arn, role.name))
            role.detach_policy(PolicyArn=policy.arn)

        print('removing role %s' % role.name)
        role.delete()
        del remote_roles[name]

    # We've purged remote state that doesn't belong. Create missing
    # instance profiles and roles.
    for name in sorted(set(IAM_INSTANCE_PROFILES) - set(remote_profiles)):
        actual = '%s%s' % (prefix, name)
        print('creating IAM instance profile %s' % actual)

        profile = iamresource.create_instance_profile(
            InstanceProfileName=actual)
        remote_profiles[name] = profile

        # Block until the new profile is visible.
        waiter = iamclient.get_waiter('instance_profile_exists')
        waiter.wait(InstanceProfileName=actual)
        print('IAM instance profile %s is available' % actual)

    for name in sorted(set(IAM_ROLES) - set(remote_roles)):
        entry = IAM_ROLES[name]

        actual = '%s%s' % (prefix, name)
        print('creating IAM role %s' % actual)

        role = iamresource.create_role(
            RoleName=actual,
            Description=entry['description'],
            AssumeRolePolicyDocument=ASSUME_ROLE_POLICY_DOCUMENT,
        )

        # Block until the new role is visible.
        waiter = iamclient.get_waiter('role_exists')
        waiter.wait(RoleName=actual)
        print('IAM role %s is available' % actual)

        remote_roles[name] = role

        for arn in entry['policy_arns']:
            print('attaching policy %s to %s' % (arn, role.name))
            role.attach_policy(PolicyArn=arn)

    # Now reconcile state of profiles.
    for name, meta in sorted(IAM_INSTANCE_PROFILES.items()):
        profile = remote_profiles[name]
        wanted = {'%s%s' % (prefix, role) for role in meta['roles']}
        have = {role.name for role in profile.roles}

        for role in sorted(have - wanted):
            print('removing role %s from %s' % (role, profile.name))
            profile.remove_role(RoleName=role)

        for role in sorted(wanted - have):
            print('adding role %s to %s' % (role, profile.name))
            profile.add_role(RoleName=role)
448 448
449 449
def find_image(ec2resource, owner_id, name):
    """Look up an available machine image by owner ID and exact name.

    Returns the first image yielded by the query. Raises ``Exception`` if
    the query yields nothing.
    """
    criteria = (
        ('owner-id', owner_id),
        ('state', 'available'),
        ('image-type', 'machine'),
        ('name', name),
    )
    filters = [{'Name': key, 'Values': [value]} for key, value in criteria]

    for candidate in ec2resource.images.filter(Filters=filters):
        # Only the first match is of interest.
        return candidate

    raise Exception('unable to find image for %s' % name)
477 477
478 478
def ensure_security_groups(ec2resource, prefix='hg-'):
    """Make sure the security groups in ``SECURITY_GROUPS`` exist.

    Groups are named ``prefix`` + key. Remote groups carrying the prefix
    that are not in ``SECURITY_GROUPS`` are deleted. Groups that already
    exist are reused as-is; missing ones are created with their ingress
    rules.

    Returns a dict mapping the unprefixed name to the security group
    resource.
    """
    existing = {
        remote.group_name[len(prefix):]: remote
        for remote in ec2resource.security_groups.all()
        if remote.group_name.startswith(prefix)
    }

    for stale in sorted(set(existing) - set(SECURITY_GROUPS)):
        legacy = existing[stale]
        print('removing legacy security group: %s' % legacy.group_name)
        legacy.delete()

    security_groups = {}

    for name, spec in sorted(SECURITY_GROUPS.items()):
        if name not in existing:
            actual = '%s%s' % (prefix, name)
            print('adding security group %s' % actual)

            created = ec2resource.create_security_group(
                Description=spec['description'],
                GroupName=actual,
            )
            created.authorize_ingress(IpPermissions=spec['ingress'])

            security_groups[name] = created
        else:
            security_groups[name] = existing[name]

    return security_groups
520 520
521 521
def terminate_ec2_instances(ec2resource, prefix='hg-'):
    """Terminate every instance whose ``Name`` tag starts with ``prefix``.

    Instances already in the ``terminated`` state are skipped. Termination
    is requested for all matching instances first; only then does the
    function wait for each of them to finish terminating.
    """
    pending = []

    for inst in ec2resource.instances.all():
        if inst.state['Name'] != 'terminated':
            for tag in inst.tags or []:
                if tag['Key'] != 'Name' or not tag['Value'].startswith(prefix):
                    continue

                print('terminating %s' % inst.id)
                inst.terminate()
                pending.append(inst)

    for inst in pending:
        inst.wait_until_terminated()
538 538
539 539
def remove_resources(c, prefix='hg-'):
    """Delete everything carrying ``prefix`` that this module manages.

    In order: EC2 instances are terminated, then self-owned AMIs, security
    groups, IAM instance profiles and IAM roles whose name starts with
    ``prefix`` are removed. Policies attached to a role are detached before
    the role is deleted.
    """
    ec2 = c.ec2resource
    iam = c.iamresource

    terminate_ec2_instances(ec2, prefix=prefix)

    for image in ec2.images.filter(Owners=['self']):
        if not image.name.startswith(prefix):
            continue
        remove_ami(ec2, image)

    for group in ec2.security_groups.all():
        if not group.group_name.startswith(prefix):
            continue
        print('removing security group %s' % group.group_name)
        group.delete()

    for profile in iam.instance_profiles.all():
        if not profile.name.startswith(prefix):
            continue
        delete_instance_profile(profile)

    for role in iam.roles.all():
        if not role.name.startswith(prefix):
            continue

        for policy in role.attached_policies.all():
            print('detaching policy %s from %s' % (policy.arn, role.name))
            role.detach_policy(PolicyArn=policy.arn)

        print('removing role %s' % role.name)
        role.delete()
568 568
569 569
def wait_for_ip_addresses(instances):
    """Block until every instance in ``instances`` has a public IP address.

    Each instance is polled in turn: while it has no public IP address the
    function sleeps for 2 seconds and reloads the instance. There is no
    timeout.
    """
    for instance in instances:
        while not instance.public_ip_address:
            time.sleep(2)
            instance.reload()

        address = instance.public_ip_address
        print('public IP address for %s: %s' % (instance.id, address))
582 582
583 583
def remove_ami(ec2resource, image):
    """Deregister an AMI and delete the snapshots backing it.

    Snapshot handles are collected from the image's EBS block device
    mappings before the image is deregistered; the snapshots are deleted
    afterwards.
    """
    backing = [
        ec2resource.Snapshot(mapping['Ebs']['SnapshotId'])
        for mapping in image.block_device_mappings
        if 'Ebs' in mapping
    ]

    print('deregistering %s' % image.id)
    image.deregister()

    for snap in backing:
        print('deleting snapshot %s' % snap.id)
        snap.delete()
598 598
599 599
def wait_for_ssm(ssmclient, instances):
    """Block until SSM reports information for all of ``instances``.

    ``instances`` is a sized collection of objects with an ``id``
    attribute. SSM is polled every 2 seconds until the number of entries
    it returns equals the number of instances. There is no timeout.
    """
    instance_ids = [instance.id for instance in instances]
    wanted = len(instances)

    while True:
        response = ssmclient.describe_instance_information(
            Filters=[{'Key': 'InstanceIds', 'Values': instance_ids}],
        )
        available = len(response['InstanceInformationList'])

        print('%d/%d instances available in SSM' % (available, wanted))

        if available == wanted:
            break

        time.sleep(2)
621 621
622 622
def run_ssm_command(ssmclient, instances, document_name, parameters):
    """Send an SSM command to ``instances`` and wait for it to succeed.

    The document ``document_name`` is invoked with ``parameters`` on all
    instances in one request, with CloudWatch output enabled. Each
    instance's invocation is then polled until its status is ``Success``.

    Raises ``Exception`` if an invocation reaches any status other than
    ``Success``, ``Pending``, ``InProgress`` or ``Delayed``.
    """

    response = ssmclient.send_command(
        InstanceIds=[instance.id for instance in instances],
        DocumentName=document_name,
        Parameters=parameters,
        CloudWatchOutputConfig={
            'CloudWatchOutputEnabled': True,
        },
    )
    command_id = response['Command']['CommandId']
    still_running = ('Pending', 'InProgress', 'Delayed')

    for instance in instances:
        status = None

        while status != 'Success':
            try:
                invocation = ssmclient.get_command_invocation(
                    CommandId=command_id,
                    InstanceId=instance.id,
                )
            except botocore.exceptions.ClientError as e:
                # The invocation may not be registered yet right after
                # sending the command; anything else is a real error.
                if e.response['Error']['Code'] != 'InvocationDoesNotExist':
                    raise

                print('could not find SSM command invocation; waiting')
                time.sleep(1)
                continue

            status = invocation['Status']

            if status in still_running:
                time.sleep(2)
            elif status != 'Success':
                raise Exception('command failed on %s: %s' % (
                    instance.id, status))
659 659
660 660
@contextlib.contextmanager
def temporary_ec2_instances(ec2resource, config):
    """Context manager providing short-lived EC2 instances.

    ``config`` is forwarded as keyword arguments to
    ``ec2resource.create_instances()``. The context manager yields the
    resulting list of instance objects and requests termination of each of
    them on exit, whether or not the body raised.

    The instances may not be ready for work when they are yielded: waiting
    for them to start responding is the caller's job.
    """

    instances = ec2resource.create_instances(**config)
    ids = [instance.id for instance in instances]

    try:
        print('started instances: %s' % ' '.join(ids))

        yield instances
    finally:
        if ids:
            print('terminating instances: %s' % ' '.join(ids))
            for instance in instances:
                instance.terminate()
            print('terminated %d instances' % len(ids))
691 691
692 692
@contextlib.contextmanager
def create_temp_windows_ec2_instances(c: AWSConnection, config):
    """Context manager providing short-lived Windows EC2 instances.

    This builds on ``temporary_ec2_instances()``: a copy of ``config`` is
    extended with the ``hg-ephemeral-ec2-1`` instance profile, a
    ``Name=hg-temp-windows`` tag and User Data that sets the Administrator
    password and configures Windows Remote Management.

    Once the instances have public IP addresses, a WinRM connection is
    established to each one. Every yielded instance has a ``winrm_client``
    attribute holding the client returned by ``wait_for_winrm()``.

    Raises ``ValueError`` if ``config`` already contains
    ``IamInstanceProfile`` or ``UserData``.
    """
    for reserved in ('IamInstanceProfile', 'UserData'):
        if reserved in config:
            raise ValueError('%s cannot be provided in config' % reserved)

    password = c.automation.default_password()

    # Work on a copy so the caller's config is not modified.
    config = copy.deepcopy(config)
    config['IamInstanceProfile'] = {
        'Name': 'hg-ephemeral-ec2-1',
    }
    name_tag = {
        'ResourceType': 'instance',
        'Tags': [{'Key': 'Name', 'Value': 'hg-temp-windows'}],
    }
    config.setdefault('TagSpecifications', []).append(name_tag)
    config['UserData'] = WINDOWS_USER_DATA % password

    with temporary_ec2_instances(c.ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        print('waiting for Windows Remote Management service...')

        for instance in instances:
            instance.winrm_client = wait_for_winrm(
                instance.public_ip_address, 'Administrator', password)
            print('established WinRM connection to %s' % instance.id)

        yield instances
730 730
731 731
def resolve_fingerprint(fingerprint):
    """Return the SHA-256 hex digest of ``fingerprint`` serialized as JSON.

    Keys are sorted during serialization so that equal mappings produce
    the same digest regardless of insertion order.
    """
    canonical = json.dumps(fingerprint, sort_keys=True).encode('utf-8')
    hasher = hashlib.sha256()
    hasher.update(canonical)
    return hasher.hexdigest()
735 735
736 736
def find_and_reconcile_image(ec2resource, name, fingerprint):
    """Find an AMI called ``name`` whose fingerprint tag matches.

    Returns an image whose ``HGIMAGEFINGERPRINT`` tag equals
    ``fingerprint``, or ``None`` if there is no such image.

    Images with this name that have no tags, or whose fingerprint tag does
    not match, are removed along the way.
    """
    # Walk all AMIs with this name, removing the invalid ones and
    # remembering a good one so it can be returned once the image state
    # is reconciled.
    candidates = ec2resource.images.filter(
        Filters=[{'Name': 'name', 'Values': [name]}])

    match = None

    for image in candidates:
        if image.tags is None:
            problem = 'lacks required tags'
        else:
            tags = {tag['Key']: tag['Value'] for tag in image.tags}

            if tags.get('HGIMAGEFINGERPRINT') == fingerprint:
                match = image
                continue

            problem = 'has wrong fingerprint'

        print('image %s for %s %s; removing' % (
            image.id, image.name, problem))
        remove_ami(ec2resource, image)

    return match
770 770
771 771
def create_ami_from_instance(ec2client, instance, name, description,
                             fingerprint):
    """Stop ``instance`` and turn it into an AMI.

    The instance is stopped and, once it has reached the stopped state, an
    image called ``name`` is created from it. The image is tagged with
    ``HGIMAGEFINGERPRINT=fingerprint`` and the function waits until the
    image is available.

    Returns the image object produced by ``instance.create_image()``.
    """
    instance.stop()

    stopped = ec2client.get_waiter('instance_stopped')
    stopped.wait(InstanceIds=[instance.id], WaiterConfig={'Delay': 5})
    print('%s is stopped' % instance.id)

    image = instance.create_image(Name=name, Description=description)

    fingerprint_tag = {'Key': 'HGIMAGEFINGERPRINT', 'Value': fingerprint}
    image.create_tags(Tags=[fingerprint_tag])

    print('waiting for image %s' % image.id)

    available = ec2client.get_waiter('image_available')
    available.wait(ImageIds=[image.id])

    print('image %s available as %s' % (image.id, image.name))

    return image
808 808
809 809
def ensure_linux_dev_ami(c: AWSConnection, distro='debian9', prefix='hg-'):
    """Ensures a Linux development AMI is available and up-to-date.

    ``distro`` selects the base image; supported values are ``debian9``,
    ``ubuntu18.04`` and ``ubuntu19.04``.

    A fingerprint is computed from the instance configuration, the
    bootstrap script and the Python requirements files. If an AMI with the
    expected name and that fingerprint already exists it is returned.
    Otherwise a temporary instance is launched from the base image,
    bootstrapped over SSH and converted into a new AMI.

    Returns an ``ec2.Image`` of either an existing AMI or a newly-built one.

    Raises ``ValueError`` for an unsupported ``distro`` and ``Exception``
    if the bootstrap script exits with a non-zero status.
    """
    ec2client = c.ec2client
    ec2resource = c.ec2resource

    name = '%s%s-%s' % (prefix, 'linux-dev', distro)

    # distro -> (AMI owner account, base AMI name, SSH login user)
    base_images = {
        'debian9': (
            DEBIAN_ACCOUNT_ID,
            'debian-stretch-hvm-x86_64-gp2-2019-09-08-17994',
            'admin',
        ),
        'ubuntu18.04': (
            UBUNTU_ACCOUNT_ID,
            'ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190918',
            'ubuntu',
        ),
        'ubuntu19.04': (
            UBUNTU_ACCOUNT_ID,
            'ubuntu/images/hvm-ssd/ubuntu-disco-19.04-amd64-server-20190918',
            'ubuntu',
        ),
    }

    try:
        owner_id, base_image_name, ssh_username = base_images[distro]
    except KeyError:
        raise ValueError('unsupported Linux distro: %s' % distro) from None

    image = find_image(ec2resource, owner_id, base_image_name)

    # NOTE: this dict feeds the fingerprint below, so changing any value
    # here causes the AMI to be rebuilt.
    config = {
        'BlockDeviceMappings': [
            {
                'DeviceName': image.block_device_mappings[0]['DeviceName'],
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': 10,
                    'VolumeType': 'gp2',
                },
            },
        ],
        'EbsOptimized': True,
        'ImageId': image.id,
        # Must be 'stop' (not 'terminate') so the instance can be imaged
        # after it is stopped.
        'InstanceInitiatedShutdownBehavior': 'stop',
        # 8 VCPUs for compiling Python.
        'InstanceType': 't3.2xlarge',
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['linux-dev-1'].id],
    }

    requirements_dir = pathlib.Path(__file__).parent.parent
    requirements2 = (requirements_dir / 'linux-requirements-py2.txt'
                     ).read_text(encoding='utf-8')
    requirements3 = (requirements_dir / 'linux-requirements-py3.txt'
                     ).read_text(encoding='utf-8')

    # Compute a deterministic fingerprint to determine whether image needs to
    # be regenerated.
    fingerprint = resolve_fingerprint({
        'instance_config': config,
        'bootstrap_script': BOOTSTRAP_DEBIAN,
        'requirements_py2': requirements2,
        'requirements_py3': requirements3,
    })

    existing_image = find_and_reconcile_image(ec2resource, name, fingerprint)

    if existing_image:
        return existing_image

    print('no suitable %s image found; creating one...' % name)

    with temporary_ec2_instances(ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        instance = instances[0]

        client = wait_for_ssh(
            instance.public_ip_address, 22,
            username=ssh_username,
            key_filename=str(c.key_pair_path_private('automation')))

        home = '/home/%s' % ssh_username

        # (remote file name, content) pairs uploaded to the home directory.
        uploads = (
            ('bootstrap', BOOTSTRAP_DEBIAN),
            ('requirements-py2.txt', requirements2),
            ('requirements-py3.txt', requirements3),
        )

        with client:
            print('connecting to SSH server')
            sftp = client.open_sftp()

            print('uploading bootstrap files')
            for filename, content in uploads:
                with sftp.open('%s/%s' % (home, filename), 'wb') as fh:
                    fh.write(content)
                    fh.chmod(0o0700)

            print('executing bootstrap')
            chan, stdin, stdout = ssh_exec_command(client,
                                                   '%s/bootstrap' % home)
            stdin.close()

            for line in stdout:
                print(line, end='')

            res = chan.recv_exit_status()
            if res:
                raise Exception('non-0 exit from bootstrap: %d' % res)

            print('bootstrap completed; stopping %s to create %s' % (
                instance.id, name))

        # Image the instance while the temporary-instance context is still
        # open, i.e. before the instance is terminated.
        return create_ami_from_instance(ec2client, instance, name,
                                        'Mercurial Linux development environment',
                                        fingerprint)
938 938
939 939
@contextlib.contextmanager
def temporary_linux_dev_instances(c: AWSConnection, image, instance_type,
                                  prefix='hg-', ensure_extra_volume=False):
    """Context manager providing short-lived Linux development instances.

    Yields a list of ``ec2.Instance`` launched from ``image`` that have a
    public IP address and an established SSH connection.

    Pass ``ensure_extra_volume=True`` to require a second storage volume
    besides the primary AMI volume. Instance types recognized as having
    instance storage are left alone; for all others an additional EBS
    volume is attached.

    Each yielded instance has two extra attributes:

    ``ssh_client``
       the SSH client connected to the instance (closed on exit).
    ``ssh_private_key_path``
       str path to the SSH private key used to connect to the instance.

    Raises ``ValueError`` if an extra volume is needed and the image's
    primary device name is not one this function knows how to extend.
    """

    def gp2_volume(device_name, size):
        # Block device mapping for a gp2 EBS volume deleted on termination.
        return {
            'DeviceName': device_name,
            'Ebs': {
                'DeleteOnTermination': True,
                'VolumeSize': size,
                'VolumeType': 'gp2',
            },
        }

    main_device = image.block_device_mappings[0]['DeviceName']
    block_device_mappings = [gp2_volume(main_device, 12)]

    # INSTANCE_TYPES_WITH_STORAGE is not an exhaustive list of instance types
    # having instance storage. Types it does not cover get an extra EBS
    # volume when a second volume was requested.
    if (ensure_extra_volume
        and not instance_type.startswith(tuple(INSTANCE_TYPES_WITH_STORAGE))):
        # Primary device name -> device name to use for the second volume.
        second_device_for = {
            'xvda': 'xvdb',
            '/dev/sda1': '/dev/sdb',
        }

        if main_device not in second_device_for:
            raise ValueError('unhandled primary EBS device name: %s' %
                             main_device)

        block_device_mappings.append(
            gp2_volume(second_device_for[main_device], 8))

    config = {
        'BlockDeviceMappings': block_device_mappings,
        'EbsOptimized': True,
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'terminate',
        'InstanceType': instance_type,
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['linux-dev-1'].id],
    }

    with temporary_ec2_instances(c.ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        ssh_private_key_path = str(c.key_pair_path_private('automation'))

        for instance in instances:
            instance.ssh_client = wait_for_ssh(
                instance.public_ip_address, 22,
                username='hg',
                key_filename=ssh_private_key_path)
            instance.ssh_private_key_path = ssh_private_key_path

        try:
            yield instances
        finally:
            for instance in instances:
                instance.ssh_client.close()
1026 1026
1027 1027
def ensure_windows_dev_ami(c: AWSConnection, prefix='hg-',
                           base_image_name=WINDOWS_BASE_IMAGE_NAME):
    """Return the Windows development AMI, building it first if required.

    A fingerprint is derived from the instance configuration, the user data,
    both bootstrap scripts and the base image name. If
    ``find_and_reconcile_image()`` returns an image for the desired name and
    fingerprint, that image is returned as-is.

    Otherwise a temporary EC2 instance is launched from the base image,
    bootstrapped (first via SSM, then via WinRM after a stop/start cycle) and
    turned into a new AMI.

    Obsolete AMIs will be deleted so there is only a single AMI having the
    desired name.

    ``prefix`` is prepended to ``windows-dev`` to form the AMI name and to
    ``automation`` to form the key pair name.

    Returns an ``ec2.Image`` of either an existing AMI or a newly-built
    one.
    """
    ec2client = c.ec2client
    ec2resource = c.ec2resource
    ssmclient = c.session.client('ssm')

    name = '%s%s' % (prefix, 'windows-dev')

    base_image = find_image(ec2resource, AMAZON_ACCOUNT_ID, base_image_name)

    root_volume = {
        'DeviceName': '/dev/sda1',
        'Ebs': {
            'DeleteOnTermination': True,
            'VolumeSize': 32,
            'VolumeType': 'gp2',
        },
    }

    config = {
        'BlockDeviceMappings': [root_volume],
        'ImageId': base_image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        'InstanceType': 't3.medium',
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['windows-dev-1'].id],
    }

    # NOTE(review): one message below says "OpenSSL" although the capability
    # being added is the OpenSSH client. The text is left untouched because
    # these commands are an input to the fingerprint computed further down.
    sshd_commands = [
        # The service has to be started once so that sshd_config exists.
        'Start-Service sshd',
        'Write-Output "modifying sshd_config"',
        r'$content = Get-Content C:\ProgramData\ssh\sshd_config',
        '$content = $content -replace "Match Group administrators","" -replace "AuthorizedKeysFile __PROGRAMDATA__/ssh/administrators_authorized_keys",""',
        r'$content | Set-Content C:\ProgramData\ssh\sshd_config',
        'Import-Module OpenSSHUtils',
        r'Repair-SshdConfigPermission C:\ProgramData\ssh\sshd_config -Confirm:$false',
        'Restart-Service sshd',
        'Write-Output "installing OpenSSL client"',
        'Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0',
        'Set-Service -Name sshd -StartupType "Automatic"',
        'Write-Output "OpenSSH server running"',
    ]

    with INSTALL_WINDOWS_DEPENDENCIES.open('r', encoding='utf-8') as fh:
        dependency_commands = [line.rstrip() for line in fh]

    # Windows Defender only slows the bootstrap down, so realtime monitoring
    # is switched off as the very first command and back on as the last.
    commands = (
        ['Set-MpPreference -DisableRealtimeMonitoring $true']
        + sshd_commands
        + dependency_commands
        + ['Set-MpPreference -DisableRealtimeMonitoring $false']
    )

    # The fingerprint is deterministic and decides whether the image has to
    # be regenerated.
    fingerprint = resolve_fingerprint({
        'instance_config': config,
        'user_data': WINDOWS_USER_DATA,
        'initial_bootstrap': WINDOWS_BOOTSTRAP_POWERSHELL,
        'bootstrap_commands': commands,
        'base_image_name': base_image_name,
    })

    existing_image = find_and_reconcile_image(ec2resource, name, fingerprint)
    if existing_image:
        return existing_image

    print('no suitable Windows development image found; creating one...')

    with create_temp_windows_ec2_instances(c, config) as instances:
        assert len(instances) == 1
        instance = instances[0]

        wait_for_ssm(ssmclient, [instance])

        # First boot: install various Windows updates.
        #
        # PowerShell Remoting would be the ideal tool, but trust issues make
        # it difficult to invoke Windows Update remotely. SSM has a mechanism
        # for running Windows Update, so it is used instead.
        print('installing Windows features...')
        ssm_parameters = {
            'commands': WINDOWS_BOOTSTRAP_POWERSHELL.split('\n'),
        }
        run_ssm_command(
            ssmclient,
            [instance],
            'AWS-RunPowerShellScript',
            ssm_parameters,
        )

        # Restart the instance so all updates are fully applied.
        #
        # instance.reboot() is avoided because it is asynchronous: there is
        # no way to know exactly when the reboot happened, stopping can take
        # a while, and we could start talking to the instance before it has
        # actually rebooted. An explicit stop, wait and start is used instead.
        print('rebooting instance %s' % instance.id)
        instance.stop()
        stopped_waiter = ec2client.get_waiter('instance_stopped')
        stopped_waiter.wait(
            InstanceIds=[instance.id],
            WaiterConfig={
                'Delay': 5,
            })

        instance.start()
        wait_for_ip_addresses([instance])

        # There is a race condition here between the User Data PS script
        # running and us connecting to WinRM. This can manifest as
        # "AuthorizationManager check failed" failures during
        # run_powershell().
        # TODO figure out a workaround.

        print('waiting for Windows Remote Management to come back...')
        winrm = wait_for_winrm(instance.public_ip_address, 'Administrator',
                               c.automation.default_password())
        print('established WinRM connection to %s' % instance.id)
        instance.winrm_client = winrm

        print('bootstrapping instance...')
        bootstrap_script = '\n'.join(commands)
        run_powershell(instance.winrm_client, bootstrap_script)

        print('bootstrap completed; stopping %s to create image' % instance.id)
        return create_ami_from_instance(
            ec2client, instance, name,
            'Mercurial Windows development environment',
            fingerprint)
1166 1166
1167 1167
@contextlib.contextmanager
def temporary_windows_dev_instances(c: AWSConnection, image, instance_type,
                                    prefix='hg-', disable_antivirus=False):
    """Launch short-lived Windows development EC2 instances.

    The context manager resolves to the list of ``EC2.Instance`` produced by
    ``create_temp_windows_ec2_instances()`` for a launch configuration built
    from ``image`` and ``instance_type``. ``prefix`` is prepended to
    ``automation`` to form the key pair name.

    When ``disable_antivirus`` is true, realtime monitoring is turned off on
    every instance, through its ``winrm_client``, before the instances are
    handed to the caller.
    """
    root_volume = {
        'DeviceName': '/dev/sda1',
        'Ebs': {
            'DeleteOnTermination': True,
            'VolumeSize': 32,
            'VolumeType': 'gp2',
        },
    }

    config = {
        'BlockDeviceMappings': [root_volume],
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        'InstanceType': instance_type,
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['windows-dev-1'].id],
    }

    with create_temp_windows_ec2_instances(c, config) as instances:
        if disable_antivirus:
            disable_monitoring = (
                'Set-MpPreference -DisableRealtimeMonitoring $true')

            for instance in instances:
                run_powershell(instance.winrm_client, disable_monitoring)

        yield instances
General Comments 0
You need to be logged in to leave comments. Login now