##// END OF EJS Templates
automation: avoid '~' in the temp directory on Windows...
Matt Harbison -
r43729:a77338d2 stable
parent child Browse files
Show More
@@ -1,1262 +1,1266
1 1 # aws.py - Automation code for Amazon Web Services
2 2 #
3 3 # Copyright 2019 Gregory Szorc <gregory.szorc@gmail.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 # no-check-code because Python 3 native.
9 9
10 10 import contextlib
11 11 import copy
12 12 import hashlib
13 13 import json
14 14 import os
15 15 import pathlib
16 16 import subprocess
17 17 import time
18 18
19 19 import boto3
20 20 import botocore.exceptions
21 21
22 22 from .linux import BOOTSTRAP_DEBIAN
23 23 from .ssh import (
24 24 exec_command as ssh_exec_command,
25 25 wait_for_ssh,
26 26 )
27 27 from .winrm import (
28 28 run_powershell,
29 29 wait_for_winrm,
30 30 )
31 31
32 32
# Directory four levels above this module's absolute path; it is the
# directory that contains ``contrib/``.
SOURCE_ROOT = pathlib.Path(
    os.path.abspath(__file__)
).parent.parent.parent.parent

# Path of the PowerShell script under ``contrib/`` named
# ``install-windows-dependencies.ps1``.
INSTALL_WINDOWS_DEPENDENCIES = (
    SOURCE_ROOT / 'contrib' / 'install-windows-dependencies.ps1'
)
40 40
41 41
# Instance type prefixes treated as already having instance storage.
# Instance types starting with one of these do not get an extra EBS volume
# when a second storage volume is requested.
INSTANCE_TYPES_WITH_STORAGE = {
    'c5d',
    'd2',
    'h1',
    'i3',
    'm5ad',
    'm5d',
    'r5d',
    'r5ad',
    'x1',
    'z1d',
}


# AWS account IDs used as the ``owner-id`` filter when looking up base AMIs.
AMAZON_ACCOUNT_ID = '801119661308'
DEBIAN_ACCOUNT_ID = '379101102735'
DEBIAN_ACCOUNT_ID_2 = '136693071363'
UBUNTU_ACCOUNT_ID = '099720109477'


# Name of the Windows base AMI.
WINDOWS_BASE_IMAGE_NAME = 'Windows_Server-2019-English-Full-Base-2019.07.12'


# Un-prefixed names of the EC2 key pairs that must exist.
KEY_PAIRS = {
    'automation',
}
68 68
69 69
# Canonical security groups, keyed by un-prefixed group name. Each entry
# carries the group description and the ingress rules (in the shape of
# EC2 ``IpPermissions``) authorized when the group is created.
SECURITY_GROUPS = {
    'linux-dev-1': {
        'description': 'Mercurial Linux instances that perform build/test automation',
        'ingress': [
            {
                'FromPort': 22,
                'ToPort': 22,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'SSH from entire Internet',
                    },
                ],
            },
        ],
    },
    'windows-dev-1': {
        'description': 'Mercurial Windows instances that perform build automation',
        'ingress': [
            {
                'FromPort': 22,
                'ToPort': 22,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'SSH from entire Internet',
                    },
                ],
            },
            {
                'FromPort': 3389,
                'ToPort': 3389,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'RDP from entire Internet',
                    },
                ],
            },
            {
                'FromPort': 5985,
                'ToPort': 5986,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'PowerShell Remoting (Windows Remote Management)',
                    },
                ],
            },
        ],
    },
}
126 126
127 127
# Canonical IAM roles, keyed by un-prefixed role name. ``policy_arns`` are
# the managed policies attached when the role is created.
IAM_ROLES = {
    'ephemeral-ec2-role-1': {
        'description': 'Mercurial temporary EC2 instances',
        'policy_arns': [
            'arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM',
        ],
    },
}


# Trust policy passed as ``AssumeRolePolicyDocument`` when roles are created;
# it allows the EC2 service to assume the role.
ASSUME_ROLE_POLICY_DOCUMENT = '''
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "ec2.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
'''.strip()


# Canonical IAM instance profiles, keyed by un-prefixed profile name, each
# listing the un-prefixed roles the profile must contain.
IAM_INSTANCE_PROFILES = {
    'ephemeral-ec2-1': {'roles': ['ephemeral-ec2-role-1',],}
}
157 157
158 158
# User Data for Windows EC2 instance. Mainly used to set the password
# and configure WinRM.
# Inspired by the User Data script used by Packer
# (from https://www.packer.io/intro/getting-started/build-image.html).
#
# The template contains exactly one ``%s`` placeholder, which receives the
# Administrator password.
WINDOWS_USER_DATA = r'''
<powershell>

# TODO enable this once we figure out what is failing.
#$ErrorActionPreference = "stop"

# Set administrator password
net user Administrator "%s"
wmic useraccount where "name='Administrator'" set PasswordExpires=FALSE

# First, make sure WinRM can't be connected to
netsh advfirewall firewall set rule name="Windows Remote Management (HTTP-In)" new enable=yes action=block

# Delete any existing WinRM listeners
winrm delete winrm/config/listener?Address=*+Transport=HTTP 2>$Null
winrm delete winrm/config/listener?Address=*+Transport=HTTPS 2>$Null

# Create a new WinRM listener and configure
winrm create winrm/config/listener?Address=*+Transport=HTTP
winrm set winrm/config/winrs '@{MaxMemoryPerShellMB="0"}'
winrm set winrm/config '@{MaxTimeoutms="7200000"}'
winrm set winrm/config/service '@{AllowUnencrypted="true"}'
winrm set winrm/config/service '@{MaxConcurrentOperationsPerUser="12000"}'
winrm set winrm/config/service/auth '@{Basic="true"}'
winrm set winrm/config/client/auth '@{Basic="true"}'

# Configure UAC to allow privilege elevation in remote shells
$Key = 'HKLM:\SOFTWARE\Microsoft\Windows\CurrentVersion\Policies\System'
$Setting = 'LocalAccountTokenFilterPolicy'
Set-ItemProperty -Path $Key -Name $Setting -Value 1 -Force

# Avoid long usernames in the temp directory path because the '~' causes extra quoting in ssh output
[System.Environment]::SetEnvironmentVariable('TMP', 'C:\Temp', [System.EnvironmentVariableTarget]::User)
[System.Environment]::SetEnvironmentVariable('TEMP', 'C:\Temp', [System.EnvironmentVariableTarget]::User)

# Configure and restart the WinRM Service; Enable the required firewall exception
Stop-Service -Name WinRM
Set-Service -Name WinRM -StartupType Automatic
netsh advfirewall firewall set rule name="Windows Remote Management (HTTP-In)" new action=allow localip=any remoteip=any
Start-Service -Name WinRM

# Disable firewall on private network interfaces so prompts don't appear.
Set-NetFirewallProfile -Name private -Enabled false
</powershell>
'''.lstrip()
204 208
205 209
# PowerShell that installs the NuGet provider, the OpenSSHUtils module, the
# OpenSSH server capability and the .NET Framework Core feature.
# NOTE(review): the "installing OpenSSL server" message below looks like a
# typo for "OpenSSH"; the text is left untouched here because this string may
# feed an image fingerprint elsewhere in the file -- confirm before changing.
WINDOWS_BOOTSTRAP_POWERSHELL = '''
Write-Output "installing PowerShell dependencies"
Install-PackageProvider -Name NuGet -MinimumVersion 2.8.5.201 -Force
Set-PSRepository -Name PSGallery -InstallationPolicy Trusted
Install-Module -Name OpenSSHUtils -RequiredVersion 0.0.2.0

Write-Output "installing OpenSSL server"
Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0
# Various tools will attempt to use older versions of .NET. So we enable
# the feature that provides them so it doesn't have to be auto-enabled
# later.
Write-Output "enabling .NET Framework feature"
Install-WindowsFeature -Name Net-Framework-Core
'''
220 224
221 225
class AWSConnection:
    """Manages the state of a connection with AWS."""

    def __init__(self, automation, region: str, ensure_ec2_state: bool = True):
        """Open boto3 clients/resources for ``region``.

        ``automation`` must expose ``state_path``, the local directory under
        which key pair files are stored.

        When ``ensure_ec2_state`` is true, key pairs, security groups and IAM
        state are reconciled immediately and ``security_groups`` is populated
        with the resulting groups; otherwise it stays an empty dict.
        """
        self.automation = automation
        self.local_state_path = automation.state_path

        self.prefix = 'hg-'

        self.session = boto3.session.Session(region_name=region)
        self.ec2client = self.session.client('ec2')
        self.ec2resource = self.session.resource('ec2')
        self.iamclient = self.session.client('iam')
        self.iamresource = self.session.resource('iam')
        self.security_groups = {}

        if ensure_ec2_state:
            ensure_key_pairs(automation.state_path, self.ec2resource)
            self.security_groups = ensure_security_groups(self.ec2resource)
            ensure_iam_state(self.iamclient, self.iamresource)

    def key_pair_path_private(self, name):
        """Path to a key pair private key file."""
        return self.local_state_path / 'keys' / ('keypair-%s' % name)

    def key_pair_path_public(self, name):
        """Path to a key pair public key file."""
        return self.local_state_path / 'keys' / ('keypair-%s.pub' % name)
249 253
250 254
def rsa_key_fingerprint(p: pathlib.Path):
    """Compute the fingerprint of an RSA private key.

    The key at ``p`` is converted to unencrypted PKCS#8 DER with the
    ``openssl`` command line tool; the result is the SHA-1 of those bytes
    rendered as colon-separated pairs of hex digits.

    Raises ``subprocess.CalledProcessError`` if ``openssl`` exits non-zero.
    """

    # TODO use rsa package.
    res = subprocess.run(
        [
            'openssl',
            'pkcs8',
            '-in',
            str(p),
            '-nocrypt',
            '-topk8',
            '-outform',
            'DER',
        ],
        capture_output=True,
        check=True,
    )

    sha1 = hashlib.sha1(res.stdout).hexdigest()
    # Group the hex digest into byte-sized (two character) chunks.
    return ':'.join(sha1[i : i + 2] for i in range(0, len(sha1), 2))
272 276
273 277
def ensure_key_pairs(state_path: pathlib.Path, ec2resource, prefix='hg-'):
    """Reconcile local key pair files with the key pairs registered in EC2.

    EC2 key pairs whose name starts with ``prefix`` are compared against the
    ``keypair-<name>`` / ``keypair-<name>.pub`` files in
    ``state_path / 'keys'``. A key that exists on only one side, or whose
    fingerprints differ, is deleted wherever it exists. Afterwards every name
    in ``KEY_PAIRS`` that is missing remotely is created in EC2 and its
    private and public halves are written to disk.
    """
    remote_existing = {}

    for kpi in ec2resource.key_pairs.all():
        if kpi.name.startswith(prefix):
            remote_existing[kpi.name[len(prefix) :]] = kpi.key_fingerprint

    # Validate that we have these keys locally.
    key_path = state_path / 'keys'
    key_path.mkdir(exist_ok=True, mode=0o700)

    def remove_remote(name):
        print('deleting key pair %s' % name)
        key = ec2resource.KeyPair(name)
        key.delete()

    def remove_local(name):
        pub_full = key_path / ('keypair-%s.pub' % name)
        priv_full = key_path / ('keypair-%s' % name)

        print('removing %s' % pub_full)
        pub_full.unlink()
        print('removing %s' % priv_full)
        priv_full.unlink()

    local_existing = {}

    # Local keys are discovered through their ``.pub`` file.
    for f in sorted(os.listdir(key_path)):
        if not f.startswith('keypair-') or not f.endswith('.pub'):
            continue

        name = f[len('keypair-') : -len('.pub')]

        pub_full = key_path / f
        priv_full = key_path / ('keypair-%s' % name)

        with open(pub_full, 'r', encoding='ascii') as fh:
            data = fh.read()

        if not data.startswith('ssh-rsa '):
            print(
                'unexpected format for key pair file: %s; removing' % pub_full
            )
            pub_full.unlink()
            priv_full.unlink()
            continue

        local_existing[name] = rsa_key_fingerprint(priv_full)

    # Drop every key that is not present, with the same fingerprint, on both
    # sides.
    for name in sorted(set(remote_existing) | set(local_existing)):
        if name not in local_existing:
            actual = '%s%s' % (prefix, name)
            print('remote key %s does not exist locally' % name)
            remove_remote(actual)
            del remote_existing[name]

        elif name not in remote_existing:
            print('local key %s does not exist remotely' % name)
            remove_local(name)
            del local_existing[name]

        elif remote_existing[name] != local_existing[name]:
            print(
                'key fingerprint mismatch for %s; '
                'removing from local and remote' % name
            )
            remove_local(name)
            remove_remote('%s%s' % (prefix, name))
            del local_existing[name]
            del remote_existing[name]

    missing = KEY_PAIRS - set(remote_existing)

    for name in sorted(missing):
        actual = '%s%s' % (prefix, name)
        print('creating key pair %s' % actual)

        priv_full = key_path / ('keypair-%s' % name)
        pub_full = key_path / ('keypair-%s.pub' % name)

        kp = ec2resource.create_key_pair(KeyName=actual)

        with priv_full.open('w', encoding='ascii') as fh:
            fh.write(kp.key_material)
            fh.write('\n')

        priv_full.chmod(0o0600)

        # SSH public key can be extracted via `ssh-keygen`.
        with pub_full.open('w', encoding='ascii') as fh:
            subprocess.run(
                ['ssh-keygen', '-y', '-f', str(priv_full)],
                stdout=fh,
                check=True,
            )

        pub_full.chmod(0o0600)
371 375
372 376
def delete_instance_profile(profile):
    """Remove every role from an IAM instance profile, then delete it."""
    for role in profile.roles:
        print(
            'removing role %s from instance profile %s'
            % (role.name, profile.name)
        )
        profile.remove_role(RoleName=role.name)

    print('deleting instance profile %s' % profile.name)
    profile.delete()
383 387
384 388
def ensure_iam_state(iamclient, iamresource, prefix='hg-'):
    """Ensure IAM state is in sync with our canonical definition.

    Instance profiles and roles whose name starts with ``prefix`` but which
    are not listed in ``IAM_INSTANCE_PROFILES`` / ``IAM_ROLES`` are deleted.
    Missing profiles and roles are created (roles get the policies listed in
    their ``policy_arns``). Finally the set of roles in each profile is made
    to match its definition.
    """

    remote_profiles = {}

    for profile in iamresource.instance_profiles.all():
        if profile.name.startswith(prefix):
            remote_profiles[profile.name[len(prefix) :]] = profile

    for name in sorted(set(remote_profiles) - set(IAM_INSTANCE_PROFILES)):
        delete_instance_profile(remote_profiles[name])
        del remote_profiles[name]

    remote_roles = {}

    for role in iamresource.roles.all():
        if role.name.startswith(prefix):
            remote_roles[role.name[len(prefix) :]] = role

    for name in sorted(set(remote_roles) - set(IAM_ROLES)):
        role = remote_roles[name]

        # IAM refuses to delete a role that still has managed policies
        # attached, so detach them first.
        for policy in role.attached_policies.all():
            print('detaching policy %s from %s' % (policy.arn, role.name))
            role.detach_policy(PolicyArn=policy.arn)

        print('removing role %s' % role.name)
        role.delete()
        del remote_roles[name]

    # We've purged remote state that doesn't belong. Create missing
    # instance profiles and roles.
    for name in sorted(set(IAM_INSTANCE_PROFILES) - set(remote_profiles)):
        actual = '%s%s' % (prefix, name)
        print('creating IAM instance profile %s' % actual)

        profile = iamresource.create_instance_profile(
            InstanceProfileName=actual
        )
        remote_profiles[name] = profile

        waiter = iamclient.get_waiter('instance_profile_exists')
        waiter.wait(InstanceProfileName=actual)
        print('IAM instance profile %s is available' % actual)

    for name in sorted(set(IAM_ROLES) - set(remote_roles)):
        entry = IAM_ROLES[name]

        actual = '%s%s' % (prefix, name)
        print('creating IAM role %s' % actual)

        role = iamresource.create_role(
            RoleName=actual,
            Description=entry['description'],
            AssumeRolePolicyDocument=ASSUME_ROLE_POLICY_DOCUMENT,
        )

        waiter = iamclient.get_waiter('role_exists')
        waiter.wait(RoleName=actual)
        print('IAM role %s is available' % actual)

        remote_roles[name] = role

        for arn in entry['policy_arns']:
            print('attaching policy %s to %s' % (arn, role.name))
            role.attach_policy(PolicyArn=arn)

    # Now reconcile state of profiles.
    for name, meta in sorted(IAM_INSTANCE_PROFILES.items()):
        profile = remote_profiles[name]
        wanted = {'%s%s' % (prefix, role) for role in meta['roles']}
        have = {role.name for role in profile.roles}

        for role in sorted(have - wanted):
            print('removing role %s from %s' % (role, profile.name))
            profile.remove_role(RoleName=role)

        for role in sorted(wanted - have):
            print('adding role %s to %s' % (role, profile.name))
            profile.add_role(RoleName=role)
461 465
462 466
def find_image(ec2resource, owner_id, name):
    """Find an AMI by its owner ID and name.

    Only available machine images are considered. The first match is
    returned; ``Exception`` is raised when nothing matches.
    """

    images = ec2resource.images.filter(
        Filters=[
            {'Name': 'owner-id', 'Values': [owner_id],},
            {'Name': 'state', 'Values': ['available'],},
            {'Name': 'image-type', 'Values': ['machine'],},
            {'Name': 'name', 'Values': [name],},
        ]
    )

    for image in images:
        return image

    raise Exception('unable to find image for %s' % name)
479 483
480 484
def ensure_security_groups(ec2resource, prefix='hg-'):
    """Ensure all necessary Mercurial security groups are present.

    All security groups are prefixed with ``hg-`` by default. Any security
    groups having this prefix but aren't in our list are deleted.

    Returns a dict mapping each un-prefixed name in ``SECURITY_GROUPS`` to
    its security group resource. Groups that already exist are returned
    as-is; their ingress rules are not re-applied.
    """
    existing = {}

    for group in ec2resource.security_groups.all():
        if group.group_name.startswith(prefix):
            existing[group.group_name[len(prefix) :]] = group

    purge = set(existing) - set(SECURITY_GROUPS)

    for name in sorted(purge):
        group = existing[name]
        print('removing legacy security group: %s' % group.group_name)
        group.delete()

    security_groups = {}

    for name, group in sorted(SECURITY_GROUPS.items()):
        if name in existing:
            security_groups[name] = existing[name]
            continue

        actual = '%s%s' % (prefix, name)
        print('adding security group %s' % actual)

        group_res = ec2resource.create_security_group(
            Description=group['description'], GroupName=actual,
        )

        group_res.authorize_ingress(IpPermissions=group['ingress'],)

        security_groups[name] = group_res

    return security_groups
519 523
520 524
def terminate_ec2_instances(ec2resource, prefix='hg-'):
    """Terminate all EC2 instances managed by us.

    An instance is considered ours when it has a ``Name`` tag whose value
    starts with ``prefix``. Already-terminated instances are skipped. The
    function returns once every instance it terminated has reached the
    terminated state.
    """
    waiting = []

    for instance in ec2resource.instances.all():
        if instance.state['Name'] == 'terminated':
            continue

        # ``tags`` may be None for untagged instances.
        for tag in instance.tags or []:
            if tag['Key'] == 'Name' and tag['Value'].startswith(prefix):
                print('terminating %s' % instance.id)
                instance.terminate()
                waiting.append(instance)

    for instance in waiting:
        instance.wait_until_terminated()
537 541
538 542
def remove_resources(c, prefix='hg-'):
    """Purge all of our resources in this EC2 region.

    In order: terminates our instances, removes self-owned AMIs, then deletes
    security groups, IAM instance profiles and IAM roles. Everything except
    instances is selected by a name starting with ``prefix``. ``c`` must
    provide ``ec2resource`` and ``iamresource`` attributes.
    """
    ec2resource = c.ec2resource
    iamresource = c.iamresource

    terminate_ec2_instances(ec2resource, prefix=prefix)

    for image in ec2resource.images.filter(Owners=['self']):
        if image.name.startswith(prefix):
            remove_ami(ec2resource, image)

    for group in ec2resource.security_groups.all():
        if group.group_name.startswith(prefix):
            print('removing security group %s' % group.group_name)
            group.delete()

    for profile in iamresource.instance_profiles.all():
        if profile.name.startswith(prefix):
            delete_instance_profile(profile)

    for role in iamresource.roles.all():
        if role.name.startswith(prefix):
            # Managed policies are detached before the role is deleted.
            for p in role.attached_policies.all():
                print('detaching policy %s from %s' % (p.arn, role.name))
                role.detach_policy(PolicyArn=p.arn)

            print('removing role %s' % role.name)
            role.delete()
567 571
568 572
def wait_for_ip_addresses(instances):
    """Wait for the public IP addresses of an iterable of instances.

    Each instance is reloaded every 2 seconds until its
    ``public_ip_address`` is set. There is no timeout.
    """
    for instance in instances:
        while not instance.public_ip_address:
            time.sleep(2)
            instance.reload()

        print(
            'public IP address for %s: %s'
            % (instance.id, instance.public_ip_address)
        )
583 587
584 588
def remove_ami(ec2resource, image):
    """Remove an AMI and its underlying snapshots.

    Snapshot references are collected from the image's EBS block device
    mappings before the image is deregistered; the snapshots are deleted
    afterwards.
    """
    snapshots = [
        ec2resource.Snapshot(device['Ebs']['SnapshotId'])
        for device in image.block_device_mappings
        if 'Ebs' in device
    ]

    print('deregistering %s' % image.id)
    image.deregister()

    for snapshot in snapshots:
        print('deleting snapshot %s' % snapshot.id)
        snapshot.delete()
599 603
600 604
def wait_for_ssm(ssmclient, instances):
    """Wait for SSM to come online for a sized collection of instances.

    ``instances`` are objects with an ``id`` attribute (``len()`` is taken of
    the collection). SSM is polled every 2 seconds until it reports as many
    entries as there are instances. There is no timeout.
    """
    while True:
        res = ssmclient.describe_instance_information(
            Filters=[
                {'Key': 'InstanceIds', 'Values': [i.id for i in instances],},
            ],
        )

        available = len(res['InstanceInformationList'])
        wanted = len(instances)

        print('%d/%d instances available in SSM' % (available, wanted))

        if available == wanted:
            return

        time.sleep(2)
619 623
620 624
def run_ssm_command(ssmclient, instances, document_name, parameters):
    """Run an SSM document on EC2 instances and wait for it to succeed.

    The command is sent once to all ``instances``; each instance's
    invocation is then polled until its status is ``Success``. Any status
    other than ``Pending``, ``InProgress`` or ``Delayed`` raises
    ``Exception``. There is no timeout.
    """

    res = ssmclient.send_command(
        InstanceIds=[i.id for i in instances],
        DocumentName=document_name,
        Parameters=parameters,
        CloudWatchOutputConfig={'CloudWatchOutputEnabled': True,},
    )

    command_id = res['Command']['CommandId']

    for instance in instances:
        while True:
            try:
                res = ssmclient.get_command_invocation(
                    CommandId=command_id, InstanceId=instance.id,
                )
            except botocore.exceptions.ClientError as e:
                # The invocation may not be visible yet; retry until it is.
                if e.response['Error']['Code'] == 'InvocationDoesNotExist':
                    print('could not find SSM command invocation; waiting')
                    time.sleep(1)
                    continue
                else:
                    raise

            if res['Status'] == 'Success':
                break
            elif res['Status'] in ('Pending', 'InProgress', 'Delayed'):
                time.sleep(2)
            else:
                raise Exception(
                    'command failed on %s: %s' % (instance.id, res['Status'])
                )
655 659
656 660
@contextlib.contextmanager
def temporary_ec2_instances(ec2resource, config):
    """Create temporary EC2 instances.

    This is a proxy to ``ec2resource.create_instances(**config)`` that takes
    care of managing the lifecycle of the instances.

    When the context manager exits, ``terminate()`` is called on each
    instance; termination is requested but not waited for.

    The context manager evaluates to the list of instance objects that were
    created. The instances may not be available for work immediately: it is
    up to the caller to wait for the instance to start responding.
    """

    ids = None

    try:
        res = ec2resource.create_instances(**config)

        ids = [i.id for i in res]
        print('started instances: %s' % ' '.join(ids))

        yield res
    finally:
        # ``ids`` stays None when creation itself failed, in which case
        # there is nothing to clean up.
        if ids:
            print('terminating instances: %s' % ' '.join(ids))
            for instance in res:
                instance.terminate()
            print('terminated %d instances' % len(ids))
687 691
688 692
@contextlib.contextmanager
def create_temp_windows_ec2_instances(
    c: AWSConnection, config, bootstrap: bool = False
):
    """Create temporary Windows EC2 instances.

    This is a higher-level wrapper around ``temporary_ec2_instances()`` that
    waits for Windows Remote Management to become reachable. The emitted
    instances will have a ``winrm_client`` attribute holding the client
    returned by ``wait_for_winrm()`` for that instance.

    ``config`` is deep-copied before being modified. It must not contain
    ``IamInstanceProfile`` or ``UserData`` (``ValueError`` is raised if it
    does): the instance profile is always set here, and ``UserData`` is set
    from ``WINDOWS_USER_DATA`` only when ``bootstrap`` is true.
    """
    if 'IamInstanceProfile' in config:
        raise ValueError('IamInstanceProfile cannot be provided in config')
    if 'UserData' in config:
        raise ValueError('UserData cannot be provided in config')

    password = c.automation.default_password()

    config = copy.deepcopy(config)
    config['IamInstanceProfile'] = {
        'Name': 'hg-ephemeral-ec2-1',
    }
    config.setdefault('TagSpecifications', []).append(
        {
            'ResourceType': 'instance',
            'Tags': [{'Key': 'Name', 'Value': 'hg-temp-windows'}],
        }
    )

    if bootstrap:
        config['UserData'] = WINDOWS_USER_DATA % password

    with temporary_ec2_instances(c.ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        print('waiting for Windows Remote Management service...')

        for instance in instances:
            client = wait_for_winrm(
                instance.public_ip_address, 'Administrator', password
            )
            print('established WinRM connection to %s' % instance.id)
            instance.winrm_client = client

        yield instances
734 738
735 739
def resolve_fingerprint(fingerprint):
    """Reduce a JSON-serializable value to a hex SHA-256 digest.

    The value is serialized with sorted keys, so dicts that differ only in
    key order produce the same digest.
    """
    serialized = json.dumps(fingerprint, sort_keys=True)
    return hashlib.sha256(serialized.encode('utf-8')).hexdigest()
739 743
740 744
def find_and_reconcile_image(ec2resource, name, fingerprint):
    """Attempt to find an existing EC2 AMI with a name and fingerprint.

    If an image with the specified fingerprint is found, it is returned.
    Otherwise None is returned.

    Existing images for the specified name that don't have the specified
    fingerprint or are missing required metadata are deleted.
    """
    # Find existing AMIs with this name and delete the ones that are invalid.
    # Store a reference to a good image so it can be returned once the
    # image state is reconciled.
    images = ec2resource.images.filter(
        Filters=[{'Name': 'name', 'Values': [name]}]
    )

    existing_image = None

    for image in images:
        if image.tags is None:
            print(
                'image %s for %s lacks required tags; removing'
                % (image.id, image.name)
            )
            remove_ami(ec2resource, image)
        else:
            tags = {t['Key']: t['Value'] for t in image.tags}

            if tags.get('HGIMAGEFINGERPRINT') == fingerprint:
                existing_image = image
            else:
                print(
                    'image %s for %s has wrong fingerprint; removing'
                    % (image.id, image.name)
                )
                remove_ami(ec2resource, image)

    return existing_image
779 783
780 784
def create_ami_from_instance(
    ec2client, instance, name, description, fingerprint
):
    """Create an AMI from a running instance.

    The instance is stopped first. The new image is tagged with
    ``HGIMAGEFINGERPRINT=<fingerprint>`` and the function blocks until the
    image is available.

    Returns the ``ec2resource.Image`` representing the created AMI.
    """
    instance.stop()

    ec2client.get_waiter('instance_stopped').wait(
        InstanceIds=[instance.id], WaiterConfig={'Delay': 5,}
    )
    print('%s is stopped' % instance.id)

    image = instance.create_image(Name=name, Description=description,)

    image.create_tags(
        Tags=[{'Key': 'HGIMAGEFINGERPRINT', 'Value': fingerprint,},]
    )

    print('waiting for image %s' % image.id)

    ec2client.get_waiter('image_available').wait(ImageIds=[image.id],)

    print('image %s available as %s' % (image.id, image.name))

    return image
808 812
809 813
def ensure_linux_dev_ami(c: AWSConnection, distro='debian10', prefix='hg-'):
    """Ensures a Linux development AMI is available and up-to-date.

    A fingerprint is derived from the instance configuration, the bootstrap
    script and the two requirements files. If an AMI named
    ``<prefix>linux-dev-<distro>`` with that fingerprint exists it is reused;
    otherwise a temporary instance is started from the distro's base image,
    bootstrapped over SSH, and turned into a new AMI.

    Raises ``ValueError`` for an unsupported ``distro`` and ``Exception``
    when the bootstrap script exits non-zero.

    Returns an ``ec2.Image`` of either an existing AMI or a newly-built one.
    """
    ec2client = c.ec2client
    ec2resource = c.ec2resource

    name = '%s%s-%s' % (prefix, 'linux-dev', distro)

    # distro -> (owner account of the base AMI, base AMI name, SSH user)
    base_images = {
        'debian9': (
            DEBIAN_ACCOUNT_ID,
            'debian-stretch-hvm-x86_64-gp2-2019-09-08-17994',
            'admin',
        ),
        'debian10': (
            DEBIAN_ACCOUNT_ID_2,
            'debian-10-amd64-20190909-10',
            'admin',
        ),
        'ubuntu18.04': (
            UBUNTU_ACCOUNT_ID,
            'ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190918',
            'ubuntu',
        ),
        'ubuntu19.04': (
            UBUNTU_ACCOUNT_ID,
            'ubuntu/images/hvm-ssd/ubuntu-disco-19.04-amd64-server-20190918',
            'ubuntu',
        ),
    }

    if distro not in base_images:
        raise ValueError('unsupported Linux distro: %s' % distro)

    owner_id, base_image_name, ssh_username = base_images[distro]
    image = find_image(ec2resource, owner_id, base_image_name)

    config = {
        'BlockDeviceMappings': [
            {
                'DeviceName': image.block_device_mappings[0]['DeviceName'],
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': 10,
                    'VolumeType': 'gp2',
                },
            },
        ],
        'EbsOptimized': True,
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        # 8 VCPUs for compiling Python.
        'InstanceType': 't3.2xlarge',
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['linux-dev-1'].id],
    }

    # The requirements files live one directory above this module's
    # directory.
    requirements_dir = pathlib.Path(__file__).parent.parent
    requirements2_path = requirements_dir / 'linux-requirements-py2.txt'
    requirements3_path = requirements_dir / 'linux-requirements-py3.txt'

    with requirements2_path.open('r', encoding='utf-8') as fh:
        requirements2 = fh.read()
    with requirements3_path.open('r', encoding='utf-8') as fh:
        requirements3 = fh.read()

    # Compute a deterministic fingerprint to determine whether image needs to
    # be regenerated.
    fingerprint = resolve_fingerprint(
        {
            'instance_config': config,
            'bootstrap_script': BOOTSTRAP_DEBIAN,
            'requirements_py2': requirements2,
            'requirements_py3': requirements3,
        }
    )

    existing_image = find_and_reconcile_image(ec2resource, name, fingerprint)

    if existing_image:
        return existing_image

    print('no suitable %s image found; creating one...' % name)

    with temporary_ec2_instances(ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        instance = instances[0]

        client = wait_for_ssh(
            instance.public_ip_address,
            22,
            username=ssh_username,
            key_filename=str(c.key_pair_path_private('automation')),
        )

        home = '/home/%s' % ssh_username

        with client:
            print('connecting to SSH server')
            sftp = client.open_sftp()

            print('uploading bootstrap files')
            with sftp.open('%s/bootstrap' % home, 'wb') as fh:
                fh.write(BOOTSTRAP_DEBIAN)
                fh.chmod(0o0700)

            with sftp.open('%s/requirements-py2.txt' % home, 'wb') as fh:
                fh.write(requirements2)
                fh.chmod(0o0700)

            with sftp.open('%s/requirements-py3.txt' % home, 'wb') as fh:
                fh.write(requirements3)
                fh.chmod(0o0700)

            print('executing bootstrap')
            chan, stdin, stdout = ssh_exec_command(
                client, '%s/bootstrap' % home
            )
            stdin.close()

            # Stream the bootstrap output as it is produced.
            for line in stdout:
                print(line, end='')

            res = chan.recv_exit_status()
            if res:
                raise Exception('non-0 exit from bootstrap: %d' % res)

            print(
                'bootstrap completed; stopping %s to create %s'
                % (instance.id, name)
            )

        # The AMI is created while still inside ``temporary_ec2_instances``,
        # i.e. before the instance is terminated.
        return create_ami_from_instance(
            ec2client,
            instance,
            name,
            'Mercurial Linux development environment',
            fingerprint,
        )
956 960
957 961
@contextlib.contextmanager
def temporary_linux_dev_instances(
    c: AWSConnection,
    image,
    instance_type,
    prefix='hg-',
    ensure_extra_volume=False,
):
    """Create temporary Linux development EC2 instances.

    Context manager resolves to a list of ``ec2.Instance`` that were created
    and are running.

    ``c`` is the connection whose ``ec2resource``, ``security_groups`` and
    ``key_pair_path_private()`` are used to launch and reach the instances.

    ``image`` is the AMI to launch; its ``id`` and the device name of its
    first block device mapping are used.

    ``instance_type`` is the EC2 instance type name to launch.

    ``prefix`` is prepended to ``automation`` to form the EC2 key pair name.

    ``ensure_extra_volume`` can be set to ``True`` to require that instances
    have a 2nd storage volume available other than the primary AMI volume.
    For instance types with instance storage, this does nothing special.
    But for instance types without instance storage, an additional EBS volume
    will be added to the instance.

    Instances have an ``ssh_client`` attribute containing a paramiko SSHClient
    instance bound to the instance.

    Instances have an ``ssh_private_key_path`` attribute containing the
    str path to the SSH private key to connect to the instance.

    Every SSH client attached to an instance is closed when the context
    exits, including when connecting to one of the instances fails.

    Raises ``ValueError`` if an extra volume is needed and the primary device
    name of the image is not one this function knows how to pair with a
    second device.
    """

    block_device_mappings = [
        {
            'DeviceName': image.block_device_mappings[0]['DeviceName'],
            'Ebs': {
                'DeleteOnTermination': True,
                'VolumeSize': 12,
                'VolumeType': 'gp2',
            },
        }
    ]

    # INSTANCE_TYPES_WITH_STORAGE is not an exhaustive list of instance types
    # having instance storage. But a type missing from it merely gets an
    # additional EBS volume attached below.
    if ensure_extra_volume and not instance_type.startswith(
        tuple(INSTANCE_TYPES_WITH_STORAGE)
    ):
        main_device = block_device_mappings[0]['DeviceName']

        if main_device == 'xvda':
            second_device = 'xvdb'
        elif main_device == '/dev/sda1':
            second_device = '/dev/sdb'
        else:
            raise ValueError(
                'unhandled primary EBS device name: %s' % main_device
            )

        block_device_mappings.append(
            {
                'DeviceName': second_device,
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': 8,
                    'VolumeType': 'gp2',
                },
            }
        )

    config = {
        'BlockDeviceMappings': block_device_mappings,
        'EbsOptimized': True,
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'terminate',
        'InstanceType': instance_type,
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['linux-dev-1'].id],
    }

    with temporary_ec2_instances(c.ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        ssh_private_key_path = str(c.key_pair_path_private('automation'))

        # Connecting happens inside the ``try`` so that a failure while
        # reaching one instance still closes the clients already opened for
        # the others.
        try:
            for instance in instances:
                client = wait_for_ssh(
                    instance.public_ip_address,
                    22,
                    username='hg',
                    key_filename=ssh_private_key_path,
                )

                instance.ssh_client = client
                instance.ssh_private_key_path = ssh_private_key_path

            yield instances
        finally:
            for instance in instances:
                # An instance we never managed to connect to has no client.
                client = getattr(instance, 'ssh_client', None)
                if client is not None:
                    client.close()
1055 1059
1056 1060
def ensure_windows_dev_ami(
    c: AWSConnection, prefix='hg-', base_image_name=WINDOWS_BASE_IMAGE_NAME
):
    """Ensure Windows Development AMI is available and up-to-date.

    If necessary, a modern AMI will be built by starting a temporary EC2
    instance and bootstrapping it.

    Obsolete AMIs will be deleted so there is only a single AMI having the
    desired name.

    ``c`` is the connection providing the EC2 client/resource, the boto
    session used to create an SSM client, the security groups and the
    automation default password.

    ``prefix`` is prepended to ``windows-dev`` to form the AMI name and to
    ``automation`` to form the EC2 key pair name.

    ``base_image_name`` is the name of the Amazon-owned image the temporary
    instance is launched from.

    Returns an ``ec2.Image`` of either an existing AMI or a newly-built
    one.

    Raises ``AssertionError`` if the temporary instance context does not
    yield exactly one instance.
    """
    ec2client = c.ec2client
    ec2resource = c.ec2resource
    ssmclient = c.session.client('ssm')

    name = '%s%s' % (prefix, 'windows-dev')

    image = find_image(ec2resource, AMAZON_ACCOUNT_ID, base_image_name)

    # Launch configuration of the temporary build instance. It is also part
    # of the fingerprint below, so editing it triggers an image rebuild.
    config = {
        'BlockDeviceMappings': [
            {
                'DeviceName': '/dev/sda1',
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': 32,
                    'VolumeType': 'gp2',
                },
            }
        ],
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        'InstanceType': 't3.medium',
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['windows-dev-1'].id],
    }

    # PowerShell statements run over WinRM after the reboot. They are joined
    # with newlines into a single script and also feed the fingerprint.
    commands = [
        # Need to start the service so sshd_config is generated.
        'Start-Service sshd',
        'Write-Output "modifying sshd_config"',
        r'$content = Get-Content C:\ProgramData\ssh\sshd_config',
        '$content = $content -replace "Match Group administrators","" -replace "AuthorizedKeysFile __PROGRAMDATA__/ssh/administrators_authorized_keys",""',
        r'$content | Set-Content C:\ProgramData\ssh\sshd_config',
        'Import-Module OpenSSHUtils',
        r'Repair-SshdConfigPermission C:\ProgramData\ssh\sshd_config -Confirm:$false',
        'Restart-Service sshd',
        # NOTE(review): the message says "OpenSSL" although the capability
        # installed on the next line is OpenSSH.Client. Left untouched since
        # the command text is part of the image fingerprint.
        'Write-Output "installing OpenSSL client"',
        'Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0',
        'Set-Service -Name sshd -StartupType "Automatic"',
        'Write-Output "OpenSSH server running"',
    ]

    # Append the dependency installation script, one command per line, with
    # trailing whitespace (including the newline) removed.
    with INSTALL_WINDOWS_DEPENDENCIES.open('r', encoding='utf-8') as fh:
        commands.extend(l.rstrip() for l in fh)

    # Schedule run of EC2Launch on next boot. This ensures that UserData
    # is executed.
    # We disable setComputerName because it forces a reboot.
    # We set an explicit admin password because this causes UserData to run
    # as Administrator instead of System.
    commands.extend(
        [
            r'''Set-Content -Path C:\ProgramData\Amazon\EC2-Windows\Launch\Config\LaunchConfig.json '''
            r'''-Value '{"setComputerName": false, "setWallpaper": true, "addDnsSuffixList": true, '''
            r'''"extendBootVolumeSize": true, "handleUserData": true, '''
            r'''"adminPasswordType": "Specify", "adminPassword": "%s"}' '''
            % c.automation.default_password(),
            # NOTE(review): the dash in front of "Schedule" is U+2013 (en
            # dash), not an ASCII hyphen; presumably PowerShell accepts it as
            # a parameter prefix - confirm before normalizing.
            r'C:\ProgramData\Amazon\EC2-Windows\Launch\Scripts\InitializeInstance.ps1 '
            r'–Schedule',
        ]
    )

    # Disable Windows Defender when bootstrapping because it just slows
    # things down.
    # Monitoring is turned off by the very first command and turned back on
    # by the very last one.
    commands.insert(0, 'Set-MpPreference -DisableRealtimeMonitoring $true')
    commands.append('Set-MpPreference -DisableRealtimeMonitoring $false')

    # Compute a deterministic fingerprint to determine whether image needs
    # to be regenerated.
    fingerprint = resolve_fingerprint(
        {
            'instance_config': config,
            'user_data': WINDOWS_USER_DATA,
            'initial_bootstrap': WINDOWS_BOOTSTRAP_POWERSHELL,
            'bootstrap_commands': commands,
            'base_image_name': base_image_name,
        }
    )

    existing_image = find_and_reconcile_image(ec2resource, name, fingerprint)

    # A truthy result is returned as-is and no instance is launched.
    if existing_image:
        return existing_image

    print('no suitable Windows development image found; creating one...')

    with create_temp_windows_ec2_instances(
        c, config, bootstrap=True
    ) as instances:
        assert len(instances) == 1
        instance = instances[0]

        wait_for_ssm(ssmclient, [instance])

        # On first boot, install various Windows updates.
        # We would ideally use PowerShell Remoting for this. However, there are
        # trust issues that make it difficult to invoke Windows Update
        # remotely. So we use SSM, which has a mechanism for running Windows
        # Update.
        print('installing Windows features...')
        run_ssm_command(
            ssmclient,
            [instance],
            'AWS-RunPowerShellScript',
            {'commands': WINDOWS_BOOTSTRAP_POWERSHELL.split('\n'),},
        )

        # Reboot so all updates are fully applied.
        #
        # We don't use instance.reboot() here because it is asynchronous and
        # we don't know when exactly the instance has rebooted. It could take
        # a while to stop and we may start trying to interact with the instance
        # before it has rebooted.
        print('rebooting instance %s' % instance.id)
        instance.stop()
        ec2client.get_waiter('instance_stopped').wait(
            InstanceIds=[instance.id], WaiterConfig={'Delay': 5,}
        )

        instance.start()
        wait_for_ip_addresses([instance])

        # There is a race condition here between the User Data PS script running
        # and us connecting to WinRM. This can manifest as
        # "AuthorizationManager check failed" failures during run_powershell().
        # TODO figure out a workaround.

        print('waiting for Windows Remote Management to come back...')
        client = wait_for_winrm(
            instance.public_ip_address,
            'Administrator',
            c.automation.default_password(),
        )
        print('established WinRM connection to %s' % instance.id)
        instance.winrm_client = client

        print('bootstrapping instance...')
        run_powershell(instance.winrm_client, '\n'.join(commands))

        # The image is created while the temporary instance context is still
        # open, i.e. before the context manager gets to clean up.
        print('bootstrap completed; stopping %s to create image' % instance.id)
        return create_ami_from_instance(
            ec2client,
            instance,
            name,
            'Mercurial Windows development environment',
            fingerprint,
        )
1220 1224
1221 1225
@contextlib.contextmanager
def temporary_windows_dev_instances(
    c: AWSConnection,
    image,
    instance_type,
    prefix='hg-',
    disable_antivirus=False,
):
    """Launch short-lived Windows development EC2 instances.

    The context manager yields the list of ``EC2.Instance`` objects that
    were created from ``image`` using ``instance_type``.

    ``prefix`` is prepended to ``automation`` to form the EC2 key pair name.

    When ``disable_antivirus`` is true, Windows Defender real-time monitoring
    is switched off on every instance before the list is yielded.
    """
    root_volume = {
        'DeviceName': '/dev/sda1',
        'Ebs': {
            'DeleteOnTermination': True,
            'VolumeSize': 32,
            'VolumeType': 'gp2',
        },
    }

    launch_config = dict(
        BlockDeviceMappings=[root_volume],
        ImageId=image.id,
        InstanceInitiatedShutdownBehavior='stop',
        InstanceType=instance_type,
        KeyName='%sautomation' % prefix,
        MaxCount=1,
        MinCount=1,
        SecurityGroupIds=[c.security_groups['windows-dev-1'].id],
    )

    with create_temp_windows_ec2_instances(c, launch_config) as instances:
        # Only touch Defender when the caller explicitly asked for it.
        needs_defender_off = instances if disable_antivirus else []

        for machine in needs_defender_off:
            # presumably winrm_client is attached by
            # create_temp_windows_ec2_instances - verify against that helper
            run_powershell(
                machine.winrm_client,
                'Set-MpPreference -DisableRealtimeMonitoring $true',
            )

        yield instances
General Comments 0
You need to be logged in to leave comments. Login now