##// END OF EJS Templates
automation: increase root volume size on Linux...
Gregory Szorc -
r42925:3e3fb15b default
parent child Browse files
Show More
@@ -1,1209 +1,1209
1 1 # aws.py - Automation code for Amazon Web Services
2 2 #
3 3 # Copyright 2019 Gregory Szorc <gregory.szorc@gmail.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 # no-check-code because Python 3 native.
9 9
10 10 import contextlib
11 11 import copy
12 12 import hashlib
13 13 import json
14 14 import os
15 15 import pathlib
16 16 import subprocess
17 17 import time
18 18
19 19 import boto3
20 20 import botocore.exceptions
21 21
22 22 from .linux import (
23 23 BOOTSTRAP_DEBIAN,
24 24 )
25 25 from .ssh import (
26 26 exec_command as ssh_exec_command,
27 27 wait_for_ssh,
28 28 )
29 29 from .winrm import (
30 30 run_powershell,
31 31 wait_for_winrm,
32 32 )
33 33
34 34
# Directory reached by applying ``.parent`` four times to this file's
# absolute path.
SOURCE_ROOT = pathlib.Path(os.path.abspath(__file__)).parent.parent.parent.parent

# Path of the ``install-windows-dependencies.ps1`` script under ``contrib/``.
INSTALL_WINDOWS_DEPENDENCIES = (SOURCE_ROOT / 'contrib' /
                                'install-windows-dependencies.ps1')


# Instance type prefixes treated as providing instance storage. They are
# matched with ``str.startswith()`` against the requested instance type when
# deciding whether an extra EBS volume has to be attached.
INSTANCE_TYPES_WITH_STORAGE = {
    'c5d',
    'd2',
    'h1',
    'i3',
    'm5ad',
    'm5d',
    'r5d',
    'r5ad',
    'x1',
    'z1d',
}


# AWS account IDs. The Debian and Ubuntu IDs are passed to ``find_image()``
# as the ``owner-id`` filter; presumably the Amazon one serves the same
# purpose for Windows images (not used in the code visible here).
AMAZON_ACCOUNT_ID = '801119661308'
DEBIAN_ACCOUNT_ID = '379101102735'
UBUNTU_ACCOUNT_ID = '099720109477'


# Name of the Windows base AMI (not referenced in the code visible here).
WINDOWS_BASE_IMAGE_NAME = 'Windows_Server-2019-English-Full-Base-2019.07.12'


# Unprefixed names of the EC2 key pairs that ``ensure_key_pairs()`` creates
# when they are missing remotely.
KEY_PAIRS = {
    'automation',
}
66 66
67 67
# Security groups managed by ``ensure_security_groups()``, keyed by their
# unprefixed name. ``description`` is used when the group is created and
# ``ingress`` is passed verbatim as ``IpPermissions`` to
# ``authorize_ingress()``.
SECURITY_GROUPS = {
    'linux-dev-1': {
        'description': 'Mercurial Linux instances that perform build/test automation',
        'ingress': [
            {
                'FromPort': 22,
                'ToPort': 22,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'SSH from entire Internet',
                    },
                ],
            },
        ],
    },
    'windows-dev-1': {
        'description': 'Mercurial Windows instances that perform build automation',
        'ingress': [
            {
                'FromPort': 22,
                'ToPort': 22,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'SSH from entire Internet',
                    },
                ],
            },
            {
                'FromPort': 3389,
                'ToPort': 3389,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'RDP from entire Internet',
                    },
                ],

            },
            {
                # Port range covering 5985 and 5986.
                'FromPort': 5985,
                'ToPort': 5986,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'PowerShell Remoting (Windows Remote Management)',
                    },
                ],
            }
        ],
    },
}
125 125
126 126
# IAM roles managed by ``ensure_iam_state()``, keyed by their unprefixed
# name. Each ARN in ``policy_arns`` is attached right after the role is
# created.
IAM_ROLES = {
    'ephemeral-ec2-role-1': {
        'description': 'Mercurial temporary EC2 instances',
        'policy_arns': [
            'arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM',
        ],
    },
}


# Passed as ``AssumeRolePolicyDocument`` when ``ensure_iam_state()`` creates
# a role; allows the EC2 service principal to call ``sts:AssumeRole``.
ASSUME_ROLE_POLICY_DOCUMENT = '''
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
'''.strip()


# IAM instance profiles managed by ``ensure_iam_state()``, keyed by their
# unprefixed name. ``roles`` lists the unprefixed role names each profile
# must contain.
IAM_INSTANCE_PROFILES = {
    'ephemeral-ec2-1': {
        'roles': [
            'ephemeral-ec2-role-1',
        ],
    }
}
160 160
161 161
# User Data for Windows EC2 instance. Mainly used to set the password
# and configure WinRM.
# Inspired by the User Data script used by Packer
# (from https://www.packer.io/intro/getting-started/build-image.html).
#
# The template contains a single ``%s`` placeholder, which
# ``create_temp_windows_ec2_instances()`` fills with the Administrator
# password.
WINDOWS_USER_DATA = r'''
<powershell>

# TODO enable this once we figure out what is failing.
#$ErrorActionPreference = "stop"

# Set administrator password
net user Administrator "%s"
wmic useraccount where "name='Administrator'" set PasswordExpires=FALSE

# First, make sure WinRM can't be connected to
netsh advfirewall firewall set rule name="Windows Remote Management (HTTP-In)" new enable=yes action=block

# Delete any existing WinRM listeners
winrm delete winrm/config/listener?Address=*+Transport=HTTP 2>$Null
winrm delete winrm/config/listener?Address=*+Transport=HTTPS 2>$Null

# Create a new WinRM listener and configure
winrm create winrm/config/listener?Address=*+Transport=HTTP
winrm set winrm/config/winrs '@{MaxMemoryPerShellMB="0"}'
winrm set winrm/config '@{MaxTimeoutms="7200000"}'
winrm set winrm/config/service '@{AllowUnencrypted="true"}'
winrm set winrm/config/service '@{MaxConcurrentOperationsPerUser="12000"}'
winrm set winrm/config/service/auth '@{Basic="true"}'
winrm set winrm/config/client/auth '@{Basic="true"}'

# Configure UAC to allow privilege elevation in remote shells
$Key = 'HKLM:\SOFTWARE\Microsoft\Windows\CurrentVersion\Policies\System'
$Setting = 'LocalAccountTokenFilterPolicy'
Set-ItemProperty -Path $Key -Name $Setting -Value 1 -Force

# Configure and restart the WinRM Service; Enable the required firewall exception
Stop-Service -Name WinRM
Set-Service -Name WinRM -StartupType Automatic
netsh advfirewall firewall set rule name="Windows Remote Management (HTTP-In)" new action=allow localip=any remoteip=any
Start-Service -Name WinRM

# Disable firewall on private network interfaces so prompts don't appear.
Set-NetFirewallProfile -Name private -Enabled false
</powershell>
'''.lstrip()
207 207
208 208
# PowerShell snippet that installs the NuGet package provider, the
# OpenSSHUtils module and the OpenSSH server capability, then enables the
# .NET Framework Core feature.
#
# The progress message for the second step used to read "OpenSSL server"
# although the command installs ``OpenSSH.Server``; the message now names
# the component that is actually installed.
WINDOWS_BOOTSTRAP_POWERSHELL = '''
Write-Output "installing PowerShell dependencies"
Install-PackageProvider -Name NuGet -MinimumVersion 2.8.5.201 -Force
Set-PSRepository -Name PSGallery -InstallationPolicy Trusted
Install-Module -Name OpenSSHUtils -RequiredVersion 0.0.2.0

Write-Output "installing OpenSSH server"
Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0
# Various tools will attempt to use older versions of .NET. So we enable
# the feature that provides them so it doesn't have to be auto-enabled
# later.
Write-Output "enabling .NET Framework feature"
Install-WindowsFeature -Name Net-Framework-Core
'''
223 223
224 224
class AWSConnection:
    """Bundle of boto3 handles and local state used to talk to one AWS region."""

    def __init__(self, automation, region: str, ensure_ec2_state: bool=True):
        """Open a boto3 session for ``region``.

        When ``ensure_ec2_state`` is true, key pairs, security groups and IAM
        state are reconciled immediately; otherwise ``security_groups`` is
        left as an empty dict.
        """
        self.automation = automation
        self.local_state_path = automation.state_path

        self.prefix = 'hg-'

        session = boto3.session.Session(region_name=region)
        self.session = session
        self.ec2client = session.client('ec2')
        self.ec2resource = session.resource('ec2')
        self.iamclient = session.client('iam')
        self.iamresource = session.resource('iam')
        self.security_groups = {}

        if not ensure_ec2_state:
            return

        ensure_key_pairs(automation.state_path, self.ec2resource)
        self.security_groups = ensure_security_groups(self.ec2resource)
        ensure_iam_state(self.iamclient, self.iamresource)

    def key_pair_path_private(self, name):
        """Path to a key pair private key file."""
        filename = 'keypair-%s' % name
        return self.local_state_path / 'keys' / filename

    def key_pair_path_public(self, name):
        """Path to a key pair public key file."""
        filename = 'keypair-%s.pub' % name
        return self.local_state_path / 'keys' / filename
252 252
253 253
def rsa_key_fingerprint(p: pathlib.Path):
    """Compute the fingerprint of an RSA private key.

    The key file at ``p`` is converted to unencrypted PKCS#8 DER with the
    ``openssl`` command line tool. The SHA-1 hex digest of that output is
    returned as pairs of hex digits joined by ``:``.

    Raises ``subprocess.CalledProcessError`` if ``openssl`` exits non-zero.
    """
    # TODO use rsa package.
    command = [
        'openssl', 'pkcs8', '-in', str(p), '-nocrypt', '-topk8',
        '-outform', 'DER',
    ]
    der = subprocess.run(command, capture_output=True, check=True).stdout

    digest = hashlib.sha1(der).hexdigest()
    return ':'.join(digest[i:i + 2] for i in range(0, len(digest), 2))
266 266
267 267
def ensure_key_pairs(state_path: pathlib.Path, ec2resource, prefix='hg-'):
    """Reconcile EC2 key pairs with the key files kept under ``state_path``.

    Remote key pairs whose name starts with ``prefix`` are compared with the
    ``keypair-<name>`` / ``keypair-<name>.pub`` files in ``state_path/keys``.
    A key present on only one side is removed from that side, and a key whose
    fingerprints differ is removed from both. Afterwards every name in
    ``KEY_PAIRS`` that is missing remotely is created and written locally.
    """
    remote_existing = {
        kpi.name[len(prefix):]: kpi.key_fingerprint
        for kpi in ec2resource.key_pairs.all()
        if kpi.name.startswith(prefix)
    }

    # Validate that we have these keys locally.
    key_path = state_path / 'keys'
    key_path.mkdir(exist_ok=True, mode=0o700)

    def private_file(name):
        return key_path / ('keypair-%s' % name)

    def public_file(name):
        return key_path / ('keypair-%s.pub' % name)

    def remove_remote(name):
        print('deleting key pair %s' % name)
        ec2resource.KeyPair(name).delete()

    def remove_local(name):
        # Public key first, then private key.
        for path in (public_file(name), private_file(name)):
            print('removing %s' % path)
            path.unlink()

    local_existing = {}

    for entry in sorted(os.listdir(key_path)):
        if not (entry.startswith('keypair-') and entry.endswith('.pub')):
            continue

        name = entry[len('keypair-'):-len('.pub')]
        pub_full = key_path / entry
        priv_full = private_file(name)

        with open(pub_full, 'r', encoding='ascii') as fh:
            data = fh.read()

        if data.startswith('ssh-rsa '):
            local_existing[name] = rsa_key_fingerprint(priv_full)
            continue

        print('unexpected format for key pair file: %s; removing' %
              pub_full)
        pub_full.unlink()
        priv_full.unlink()

    for name in sorted(set(remote_existing) | set(local_existing)):
        remote_name = '%s%s' % (prefix, name)

        if name not in local_existing:
            print('remote key %s does not exist locally' % name)
            remove_remote(remote_name)
            del remote_existing[name]

        elif name not in remote_existing:
            print('local key %s does not exist remotely' % name)
            remove_local(name)
            del local_existing[name]

        elif remote_existing[name] != local_existing[name]:
            print('key fingerprint mismatch for %s; '
                  'removing from local and remote' % name)
            remove_local(name)
            remove_remote(remote_name)
            del local_existing[name]
            del remote_existing[name]

    for name in sorted(KEY_PAIRS - set(remote_existing)):
        actual = '%s%s' % (prefix, name)
        print('creating key pair %s' % actual)

        priv_full = private_file(name)
        pub_full = public_file(name)

        kp = ec2resource.create_key_pair(KeyName=actual)

        with priv_full.open('w', encoding='ascii') as fh:
            fh.write(kp.key_material + '\n')

        priv_full.chmod(0o0600)

        # SSH public key can be extracted via `ssh-keygen`.
        with pub_full.open('w', encoding='ascii') as fh:
            subprocess.run(
                ['ssh-keygen', '-y', '-f', str(priv_full)],
                stdout=fh,
                check=True)

        pub_full.chmod(0o0600)
361 361
362 362
def delete_instance_profile(profile):
    """Remove every role from an IAM instance profile, then delete the profile."""
    for attached in profile.roles:
        message = 'removing role %s from instance profile %s' % (
            attached.name, profile.name)
        print(message)
        profile.remove_role(RoleName=attached.name)

    message = 'deleting instance profile %s' % profile.name
    print(message)
    profile.delete()
371 371
372 372
def ensure_iam_state(iamclient, iamresource, prefix='hg-'):
    """Ensure IAM state is in sync with our canonical definition.

    Instance profiles and roles whose name starts with ``prefix`` but which
    are not listed in ``IAM_INSTANCE_PROFILES`` / ``IAM_ROLES`` are deleted.
    Missing profiles and roles are then created (waiting for each to exist),
    and finally each profile's role membership is made to match its
    definition.

    ``iamclient`` is used for waiters; ``iamresource`` for everything else.
    """

    remote_profiles = {}

    for profile in iamresource.instance_profiles.all():
        if profile.name.startswith(prefix):
            remote_profiles[profile.name[len(prefix):]] = profile

    for name in sorted(set(remote_profiles) - set(IAM_INSTANCE_PROFILES)):
        delete_instance_profile(remote_profiles[name])
        del remote_profiles[name]

    remote_roles = {}

    for role in iamresource.roles.all():
        if role.name.startswith(prefix):
            remote_roles[role.name[len(prefix):]] = role

    for name in sorted(set(remote_roles) - set(IAM_ROLES)):
        role = remote_roles[name]

        # IAM rejects deletion of a role that still has managed policies
        # attached, so detach them first (same as remove_resources()).
        for policy in role.attached_policies.all():
            print('detaching policy %s from %s' % (policy.arn, role.name))
            role.detach_policy(PolicyArn=policy.arn)

        print('removing role %s' % role.name)
        role.delete()
        del remote_roles[name]

    # We've purged remote state that doesn't belong. Create missing
    # instance profiles and roles.
    for name in sorted(set(IAM_INSTANCE_PROFILES) - set(remote_profiles)):
        actual = '%s%s' % (prefix, name)
        print('creating IAM instance profile %s' % actual)

        profile = iamresource.create_instance_profile(
            InstanceProfileName=actual)
        remote_profiles[name] = profile

        # Block until the new profile exists before moving on.
        waiter = iamclient.get_waiter('instance_profile_exists')
        waiter.wait(InstanceProfileName=actual)
        print('IAM instance profile %s is available' % actual)

    for name in sorted(set(IAM_ROLES) - set(remote_roles)):
        entry = IAM_ROLES[name]

        actual = '%s%s' % (prefix, name)
        print('creating IAM role %s' % actual)

        role = iamresource.create_role(
            RoleName=actual,
            Description=entry['description'],
            AssumeRolePolicyDocument=ASSUME_ROLE_POLICY_DOCUMENT,
        )

        waiter = iamclient.get_waiter('role_exists')
        waiter.wait(RoleName=actual)
        print('IAM role %s is available' % actual)

        remote_roles[name] = role

        for arn in entry['policy_arns']:
            print('attaching policy %s to %s' % (arn, role.name))
            role.attach_policy(PolicyArn=arn)

    # Now reconcile state of profiles.
    for name, meta in sorted(IAM_INSTANCE_PROFILES.items()):
        profile = remote_profiles[name]
        wanted = {'%s%s' % (prefix, role) for role in meta['roles']}
        have = {role.name for role in profile.roles}

        for role in sorted(have - wanted):
            print('removing role %s from %s' % (role, profile.name))
            profile.remove_role(RoleName=role)

        for role in sorted(wanted - have):
            print('adding role %s to %s' % (role, profile.name))
            profile.add_role(RoleName=role)
448 448
449 449
def find_image(ec2resource, owner_id, name):
    """Find an AMI by its owner ID and name.

    Only machine images in the ``available`` state are considered. The first
    match is returned; a plain ``Exception`` is raised when nothing matches.
    """
    criteria = (
        ('owner-id', owner_id),
        ('state', 'available'),
        ('image-type', 'machine'),
        ('name', name),
    )
    filters = [{'Name': key, 'Values': [value]} for key, value in criteria]

    nothing = object()
    first = next(iter(ec2resource.images.filter(Filters=filters)), nothing)

    if first is nothing:
        raise Exception('unable to find image for %s' % name)

    return first
477 477
478 478
def ensure_security_groups(ec2resource, prefix='hg-'):
    """Ensure all necessary Mercurial security groups are present.

    All security groups are prefixed with ``hg-`` by default. Any security
    groups having this prefix but aren't in our list are deleted.

    Returns a dict mapping each unprefixed name in ``SECURITY_GROUPS`` to its
    security group resource. Groups that already exist are reused as-is.
    """
    existing = {
        found.group_name[len(prefix):]: found
        for found in ec2resource.security_groups.all()
        if found.group_name.startswith(prefix)
    }

    for stale in sorted(set(existing) - set(SECURITY_GROUPS)):
        print('removing legacy security group: %s' %
              existing[stale].group_name)
        existing[stale].delete()

    security_groups = {}

    for name, definition in sorted(SECURITY_GROUPS.items()):
        if name not in existing:
            actual = '%s%s' % (prefix, name)
            print('adding security group %s' % actual)

            created = ec2resource.create_security_group(
                Description=definition['description'],
                GroupName=actual,
            )
            created.authorize_ingress(IpPermissions=definition['ingress'])

            security_groups[name] = created
        else:
            security_groups[name] = existing[name]

    return security_groups
520 520
521 521
def terminate_ec2_instances(ec2resource, prefix='hg-'):
    """Terminate all EC2 instances managed by us.

    An instance is ours when it has a ``Name`` tag whose value starts with
    ``prefix``. Already-terminated instances are skipped. The call returns
    once every instance it terminated has reached the terminated state.
    """
    pending = []

    for instance in ec2resource.instances.all():
        if instance.state['Name'] == 'terminated':
            continue

        for tag in instance.tags or []:
            if tag['Key'] != 'Name' or not tag['Value'].startswith(prefix):
                continue

            print('terminating %s' % instance.id)
            instance.terminate()
            pending.append(instance)

    for instance in pending:
        instance.wait_until_terminated()
538 538
539 539
def remove_resources(c, prefix='hg-'):
    """Purge all of our resources in this EC2 region.

    In order: terminates our instances, removes our AMIs (and their
    snapshots), deletes our security groups, deletes our IAM instance
    profiles and finally deletes our IAM roles after detaching their
    policies. "Ours" means the name starts with ``prefix``.
    """
    ec2resource = c.ec2resource
    iamresource = c.iamresource

    terminate_ec2_instances(ec2resource, prefix=prefix)

    for image in ec2resource.images.filter(Owners=['self']):
        if not image.name.startswith(prefix):
            continue
        remove_ami(ec2resource, image)

    for group in ec2resource.security_groups.all():
        if not group.group_name.startswith(prefix):
            continue
        print('removing security group %s' % group.group_name)
        group.delete()

    for profile in iamresource.instance_profiles.all():
        if not profile.name.startswith(prefix):
            continue
        delete_instance_profile(profile)

    for role in iamresource.roles.all():
        if not role.name.startswith(prefix):
            continue

        for policy in role.attached_policies.all():
            print('detaching policy %s from %s' % (policy.arn, role.name))
            role.detach_policy(PolicyArn=policy.arn)

        print('removing role %s' % role.name)
        role.delete()
568 568
569 569
def wait_for_ip_addresses(instances):
    """Wait for the public IP addresses of an iterable of instances.

    Each instance is reloaded every 2 seconds until its
    ``public_ip_address`` is set. There is no timeout.
    """
    for instance in instances:
        while not instance.public_ip_address:
            time.sleep(2)
            instance.reload()

        address = instance.public_ip_address
        print('public IP address for %s: %s' % (instance.id, address))
582 582
583 583
def remove_ami(ec2resource, image):
    """Remove an AMI and its underlying snapshots.

    The snapshot handles are collected before the image is deregistered and
    deleted afterwards.
    """
    snapshots = [
        ec2resource.Snapshot(mapping['Ebs']['SnapshotId'])
        for mapping in image.block_device_mappings
        if 'Ebs' in mapping
    ]

    print('deregistering %s' % image.id)
    image.deregister()

    for snap in snapshots:
        print('deleting snapshot %s' % snap.id)
        snap.delete()
598 598
599 599
def wait_for_ssm(ssmclient, instances):
    """Wait for SSM to come online for a sequence of instances.

    ``instances`` must support ``len()`` and each element must expose an
    ``id`` attribute. SSM is polled every 2 seconds until the number of
    reported instances equals the number requested. There is no timeout.
    """
    while True:
        id_filter = {
            'Key': 'InstanceIds',
            'Values': [i.id for i in instances],
        }
        res = ssmclient.describe_instance_information(Filters=[id_filter])

        available = len(res['InstanceInformationList'])
        wanted = len(instances)

        print('%d/%d instances available in SSM' % (available, wanted))

        if available != wanted:
            time.sleep(2)
            continue

        return
621 621
622 622
def run_ssm_command(ssmclient, instances, document_name, parameters):
    """Run an SSM document on EC2 instances and wait for it to succeed.

    The command is sent to all ``instances`` at once with CloudWatch output
    enabled. Each instance's invocation is then polled until it reports
    ``Success``. Any status other than ``Pending``, ``InProgress`` or
    ``Delayed`` raises an ``Exception``.
    """
    sent = ssmclient.send_command(
        InstanceIds=[i.id for i in instances],
        DocumentName=document_name,
        Parameters=parameters,
        CloudWatchOutputConfig={
            'CloudWatchOutputEnabled': True,
        },
    )

    command_id = sent['Command']['CommandId']
    still_running = ('Pending', 'InProgress', 'Delayed')

    for instance in instances:
        while True:
            try:
                res = ssmclient.get_command_invocation(
                    CommandId=command_id,
                    InstanceId=instance.id,
                )
            except botocore.exceptions.ClientError as e:
                # The invocation may not be visible right after sending.
                if e.response['Error']['Code'] != 'InvocationDoesNotExist':
                    raise

                print('could not find SSM command invocation; waiting')
                time.sleep(1)
                continue

            status = res['Status']

            if status == 'Success':
                break

            if status not in still_running:
                raise Exception('command failed on %s: %s' % (
                    instance.id, status))

            time.sleep(2)
659 659
660 660
@contextlib.contextmanager
def temporary_ec2_instances(ec2resource, config):
    """Create temporary EC2 instances.

    This is a proxy to ``ec2resource.create_instances(**config)`` that takes
    care of managing the lifecycle of the instances.

    The context manager evaluates to the list of instance objects returned
    by that call. The instances may not be available for work immediately:
    it is up to the caller to wait for the instance to start responding.

    When the context manager exits, normally or through an exception, the
    instances are terminated.
    """

    instance_ids = None

    try:
        created = ec2resource.create_instances(**config)

        instance_ids = [i.id for i in created]
        print('started instances: %s' % ' '.join(instance_ids))

        yield created
    finally:
        # Nothing to clean up if creation itself failed.
        if instance_ids:
            print('terminating instances: %s' % ' '.join(instance_ids))
            for instance in created:
                instance.terminate()
            print('terminated %d instances' % len(instance_ids))
691 691
692 692
@contextlib.contextmanager
def create_temp_windows_ec2_instances(c: AWSConnection, config):
    """Create temporary Windows EC2 instances.

    This is a higher-level wrapper around ``temporary_ec2_instances()`` that
    configures the Windows instance for Windows Remote Management. The emitted
    instances will have a ``winrm_client`` attribute containing a
    ``pypsrp.client.Client`` instance bound to the instance.

    ``config`` is deep-copied before being modified. It must not contain
    ``IamInstanceProfile`` or ``UserData``; a ``ValueError`` is raised if it
    does.
    """
    for reserved in ('IamInstanceProfile', 'UserData'):
        if reserved in config:
            raise ValueError('%s cannot be provided in config' % reserved)

    password = c.automation.default_password()

    config = copy.deepcopy(config)
    config['IamInstanceProfile'] = {
        'Name': 'hg-ephemeral-ec2-1',
    }

    name_tag = {'Key': 'Name', 'Value': 'hg-temp-windows'}
    config.setdefault('TagSpecifications', []).append({
        'ResourceType': 'instance',
        'Tags': [name_tag],
    })
    config['UserData'] = WINDOWS_USER_DATA % password

    with temporary_ec2_instances(c.ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        print('waiting for Windows Remote Management service...')

        for instance in instances:
            instance_client = wait_for_winrm(
                instance.public_ip_address, 'Administrator', password)
            print('established WinRM connection to %s' % instance.id)
            instance.winrm_client = instance_client

        yield instances
730 730
731 731
def resolve_fingerprint(fingerprint):
    """Return the SHA-256 hex digest of ``fingerprint`` as key-sorted JSON."""
    canonical = json.dumps(fingerprint, sort_keys=True).encode('utf-8')
    return hashlib.sha256(canonical).hexdigest()
735 735
736 736
def find_and_reconcile_image(ec2resource, name, fingerprint):
    """Attempt to find an existing EC2 AMI with a name and fingerprint.

    If an image with the specified fingerprint is found, it is returned.
    Otherwise None is returned.

    Existing images for the specified name that don't have the specified
    fingerprint or are missing required metadata are deleted.
    """
    # Walk every AMI with this name, deleting the invalid ones and
    # remembering a good one so it can be returned once the image state
    # is reconciled.
    candidates = ec2resource.images.filter(
        Filters=[{'Name': 'name', 'Values': [name]}])

    keeper = None

    for image in candidates:
        if image.tags is None:
            print('image %s for %s lacks required tags; removing' % (
                  image.id, image.name))
            remove_ami(ec2resource, image)
            continue

        tag_map = {t['Key']: t['Value'] for t in image.tags}

        if tag_map.get('HGIMAGEFINGERPRINT') != fingerprint:
            print('image %s for %s has wrong fingerprint; removing' % (
                  image.id, image.name))
            remove_ami(ec2resource, image)
            continue

        keeper = image

    return keeper
770 770
771 771
def create_ami_from_instance(ec2client, instance, name, description,
                             fingerprint):
    """Create an AMI from a running instance.

    The instance is stopped first. The new image is tagged with
    ``HGIMAGEFINGERPRINT`` set to ``fingerprint``, and the call returns only
    after the image is available.

    Returns the ``ec2resource.Image`` representing the created AMI.
    """
    instance.stop()

    stopped_waiter = ec2client.get_waiter('instance_stopped')
    stopped_waiter.wait(
        InstanceIds=[instance.id],
        WaiterConfig={
            'Delay': 5,
        })
    print('%s is stopped' % instance.id)

    image = instance.create_image(Name=name, Description=description)

    fingerprint_tag = {
        'Key': 'HGIMAGEFINGERPRINT',
        'Value': fingerprint,
    }
    image.create_tags(Tags=[fingerprint_tag])

    print('waiting for image %s' % image.id)

    available_waiter = ec2client.get_waiter('image_available')
    available_waiter.wait(ImageIds=[image.id])

    print('image %s available as %s' % (image.id, image.name))

    return image
808 808
809 809
def ensure_linux_dev_ami(c: AWSConnection, distro='debian9', prefix='hg-'):
    """Ensures a Linux development AMI is available and up-to-date.

    An existing AMI is reused when its fingerprint matches the one computed
    from the instance config, the bootstrap script and the two requirements
    files. Otherwise a temporary instance is started from the distro's base
    image, bootstrapped over SSH and turned into a new AMI.

    Raises ``ValueError`` for an unsupported ``distro``.

    Returns an ``ec2.Image`` of either an existing AMI or a newly-built one.
    """
    ec2client = c.ec2client
    ec2resource = c.ec2resource

    name = '%s%s-%s' % (prefix, 'linux-dev', distro)

    # distro -> (AMI owner account, base AMI name, SSH login user)
    base_images = {
        'debian9': (
            DEBIAN_ACCOUNT_ID,
            'debian-stretch-hvm-x86_64-gp2-2019-02-19-26620',
            'admin',
        ),
        'ubuntu18.04': (
            UBUNTU_ACCOUNT_ID,
            'ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190403',
            'ubuntu',
        ),
        'ubuntu18.10': (
            UBUNTU_ACCOUNT_ID,
            'ubuntu/images/hvm-ssd/ubuntu-cosmic-18.10-amd64-server-20190402',
            'ubuntu',
        ),
        'ubuntu19.04': (
            UBUNTU_ACCOUNT_ID,
            'ubuntu/images/hvm-ssd/ubuntu-disco-19.04-amd64-server-20190417',
            'ubuntu',
        ),
    }

    if distro not in base_images:
        raise ValueError('unsupported Linux distro: %s' % distro)

    owner_id, base_image_name, ssh_username = base_images[distro]
    image = find_image(ec2resource, owner_id, base_image_name)

    root_device = {
        'DeviceName': image.block_device_mappings[0]['DeviceName'],
        'Ebs': {
            'DeleteOnTermination': True,
            'VolumeSize': 8,
            'VolumeType': 'gp2',
        },
    }

    config = {
        'BlockDeviceMappings': [root_device],
        'EbsOptimized': True,
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        # 8 VCPUs for compiling Python.
        'InstanceType': 't3.2xlarge',
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['linux-dev-1'].id],
    }

    requirements_dir = pathlib.Path(__file__).parent.parent
    requirements2 = (requirements_dir / 'linux-requirements-py2.txt').read_text(
        encoding='utf-8')
    requirements3 = (requirements_dir / 'linux-requirements-py3.txt').read_text(
        encoding='utf-8')

    # Compute a deterministic fingerprint to determine whether image needs to
    # be regenerated.
    fingerprint = resolve_fingerprint({
        'instance_config': config,
        'bootstrap_script': BOOTSTRAP_DEBIAN,
        'requirements_py2': requirements2,
        'requirements_py3': requirements3,
    })

    existing_image = find_and_reconcile_image(ec2resource, name, fingerprint)

    if existing_image:
        return existing_image

    print('no suitable %s image found; creating one...' % name)

    with temporary_ec2_instances(ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        instance = instances[0]

        client = wait_for_ssh(
            instance.public_ip_address, 22,
            username=ssh_username,
            key_filename=str(c.key_pair_path_private('automation')))

        home = '/home/%s' % ssh_username

        # (remote file name, content) in upload order.
        uploads = (
            ('bootstrap', BOOTSTRAP_DEBIAN),
            ('requirements-py2.txt', requirements2),
            ('requirements-py3.txt', requirements3),
        )

        with client:
            print('connecting to SSH server')
            sftp = client.open_sftp()

            print('uploading bootstrap files')
            for remote_name, content in uploads:
                with sftp.open('%s/%s' % (home, remote_name), 'wb') as fh:
                    fh.write(content)
                    fh.chmod(0o0700)

            print('executing bootstrap')
            chan, stdin, stdout = ssh_exec_command(client,
                                                   '%s/bootstrap' % home)
            stdin.close()

            # Relay the remote output as it arrives.
            for line in stdout:
                print(line, end='')

            exit_status = chan.recv_exit_status()
            if exit_status:
                raise Exception('non-0 exit from bootstrap: %d' % exit_status)

            print('bootstrap completed; stopping %s to create %s' % (
                  instance.id, name))

        # Still inside temporary_ec2_instances(): the instance must exist
        # while the image is created from it.
        return create_ami_from_instance(ec2client, instance, name,
                                        'Mercurial Linux development environment',
                                        fingerprint)
945 945
946 946
@contextlib.contextmanager
def temporary_linux_dev_instances(c: AWSConnection, image, instance_type,
                                  prefix='hg-', ensure_extra_volume=False):
    """Create temporary Linux development EC2 instances.

    Context manager resolves to a list of ``ec2.Instance`` that were created
    and are running.

    ``ensure_extra_volume`` can be set to ``True`` to require that instances
    have a 2nd storage volume available other than the primary AMI volume.
    For instance types with instance storage, this does nothing special.
    But for instance types without instance storage, an additional EBS volume
    will be added to the instance.

    Instances have an ``ssh_client`` attribute containing a paramiko SSHClient
    instance bound to the instance.

    Instances have an ``ssh_private_key_path`` attribute containing the
    str path to the SSH private key to connect to the instance.

    Raises ``ValueError`` if an extra volume is needed but the image's
    primary device name is neither ``xvda`` nor ``/dev/sda1``.

    Every SSH client that was opened is closed on exit, including when
    connecting to a later instance fails.
    """

    block_device_mappings = [
        {
            'DeviceName': image.block_device_mappings[0]['DeviceName'],
            'Ebs': {
                'DeleteOnTermination': True,
                # Root volume size in GiB.
                'VolumeSize': 12,
                'VolumeType': 'gp2',
            },
        }
    ]

    # INSTANCE_TYPES_WITH_STORAGE is not an exhaustive list of instance types
    # having instance storage, so a type missing from it gets an extra EBS
    # volume even if it already has local storage.
    if (ensure_extra_volume
        and not instance_type.startswith(tuple(INSTANCE_TYPES_WITH_STORAGE))):
        main_device = block_device_mappings[0]['DeviceName']

        if main_device == 'xvda':
            second_device = 'xvdb'
        elif main_device == '/dev/sda1':
            second_device = '/dev/sdb'
        else:
            raise ValueError('unhandled primary EBS device name: %s' %
                             main_device)

        block_device_mappings.append({
            'DeviceName': second_device,
            'Ebs': {
                'DeleteOnTermination': True,
                'VolumeSize': 8,
                'VolumeType': 'gp2',
            }
        })

    config = {
        'BlockDeviceMappings': block_device_mappings,
        'EbsOptimized': True,
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'terminate',
        'InstanceType': instance_type,
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['linux-dev-1'].id],
    }

    with temporary_ec2_instances(c.ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        ssh_private_key_path = str(c.key_pair_path_private('automation'))

        # Instances with an open SSH client. Tracked separately so that only
        # clients that were actually opened get closed.
        connected = []

        try:
            for instance in instances:
                client = wait_for_ssh(
                    instance.public_ip_address, 22,
                    username='hg',
                    key_filename=ssh_private_key_path)

                instance.ssh_client = client
                instance.ssh_private_key_path = ssh_private_key_path
                connected.append(instance)

            yield instances
        finally:
            for instance in connected:
                instance.ssh_client.close()
1033 1033
1034 1034
def ensure_windows_dev_ami(c: AWSConnection, prefix='hg-',
                           base_image_name=WINDOWS_BASE_IMAGE_NAME):
    """Return the Windows development AMI, building a fresh one if needed.

    A fingerprint is derived from the instance configuration, the user data,
    both bootstrap scripts and the base image name. If an existing AMI with
    the expected name matches that fingerprint, it is returned as-is.
    Otherwise a temporary EC2 instance is started, bootstrapped and turned
    into a new AMI.

    Obsolete AMIs will be deleted so there is only a single AMI having the
    desired name.

    Returns an ``ec2.Image`` of either an existing AMI or a newly-built
    one.
    """
    ec2client = c.ec2client
    ec2resource = c.ec2resource
    ssmclient = c.session.client('ssm')

    ami_name = '%s%s' % (prefix, 'windows-dev')

    base_image = find_image(ec2resource, AMAZON_ACCOUNT_ID, base_image_name)

    root_volume = {
        'DeviceName': '/dev/sda1',
        'Ebs': {
            'DeleteOnTermination': True,
            'VolumeSize': 32,
            'VolumeType': 'gp2',
        },
    }

    # Key order is kept stable because this dict feeds the fingerprint.
    launch_config = {
        'BlockDeviceMappings': [root_volume],
        'ImageId': base_image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        'InstanceType': 't3.medium',
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['windows-dev-1'].id],
    }

    sshd_commands = [
        # sshd_config only exists once the service has been started.
        'Start-Service sshd',
        'Write-Output "modifying sshd_config"',
        r'$content = Get-Content C:\ProgramData\ssh\sshd_config',
        '$content = $content -replace "Match Group administrators","" -replace "AuthorizedKeysFile __PROGRAMDATA__/ssh/administrators_authorized_keys",""',
        r'$content | Set-Content C:\ProgramData\ssh\sshd_config',
        'Import-Module OpenSSHUtils',
        r'Repair-SshdConfigPermission C:\ProgramData\ssh\sshd_config -Confirm:$false',
        'Restart-Service sshd',
        'Write-Output "installing OpenSSL client"',
        'Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0',
        'Set-Service -Name sshd -StartupType "Automatic"',
        'Write-Output "OpenSSH server running"',
    ]

    with INSTALL_WINDOWS_DEPENDENCIES.open('r', encoding='utf-8') as script:
        dependency_commands = [line.rstrip() for line in script]

    # Windows Defender just slows bootstrapping down, so real-time monitoring
    # is switched off first and switched back on as the very last command.
    commands = (
        ['Set-MpPreference -DisableRealtimeMonitoring $true']
        + sshd_commands
        + dependency_commands
        + ['Set-MpPreference -DisableRealtimeMonitoring $false']
    )

    # Deterministic fingerprint used to decide whether the image has to be
    # regenerated.
    fingerprint = resolve_fingerprint({
        'instance_config': launch_config,
        'user_data': WINDOWS_USER_DATA,
        'initial_bootstrap': WINDOWS_BOOTSTRAP_POWERSHELL,
        'bootstrap_commands': commands,
        'base_image_name': base_image_name,
    })

    reusable_image = find_and_reconcile_image(ec2resource, ami_name,
                                              fingerprint)
    if reusable_image:
        return reusable_image

    print('no suitable Windows development image found; creating one...')

    with create_temp_windows_ec2_instances(c, launch_config) as instances:
        assert len(instances) == 1
        instance = instances[0]

        wait_for_ssm(ssmclient, [instance])

        # First boot: install various Windows updates. PowerShell Remoting
        # would be preferable, but trust issues make it difficult to invoke
        # Windows Update remotely, so SSM (which has a mechanism for running
        # Windows Update) is used instead.
        print('installing Windows features...')
        ssm_parameters = {
            'commands': WINDOWS_BOOTSTRAP_POWERSHELL.split('\n'),
        }
        run_ssm_command(ssmclient, [instance], 'AWS-RunPowerShellScript',
                        ssm_parameters)

        # Reboot so all updates are fully applied.
        #
        # instance.reboot() is avoided because it is asynchronous: we would
        # not know when the instance has actually rebooted and could start
        # talking to it before it has gone down. An explicit stop, wait and
        # start sequence is used instead.
        print('rebooting instance %s' % instance.id)
        instance.stop()
        stopped_waiter = ec2client.get_waiter('instance_stopped')
        stopped_waiter.wait(InstanceIds=[instance.id],
                            WaiterConfig={'Delay': 5})

        instance.start()
        wait_for_ip_addresses([instance])

        # There is a race condition here between the User Data PS script
        # running and us connecting to WinRM. This can manifest as
        # "AuthorizationManager check failed" failures during
        # run_powershell().
        # TODO figure out a workaround.

        print('waiting for Windows Remote Management to come back...')
        winrm = wait_for_winrm(instance.public_ip_address, 'Administrator',
                               c.automation.default_password())
        print('established WinRM connection to %s' % instance.id)
        instance.winrm_client = winrm

        print('bootstrapping instance...')
        bootstrap_script = '\n'.join(commands)
        run_powershell(instance.winrm_client, bootstrap_script)

        print('bootstrap completed; stopping %s to create image' % instance.id)
        return create_ami_from_instance(
            ec2client, instance, ami_name,
            'Mercurial Windows development environment',
            fingerprint)
1173 1173
1174 1174
@contextlib.contextmanager
def temporary_windows_dev_instances(c: AWSConnection, image, instance_type,
                                    prefix='hg-', disable_antivirus=False):
    """Launch a temporary Windows development EC2 instance from ``image``.

    Context manager resolves to the list of ``EC2.Instance`` that were created.

    When ``disable_antivirus`` is true, a PowerShell command turning off
    real-time monitoring is run on every instance before the list is yielded.
    """
    root_volume = {
        'DeviceName': '/dev/sda1',
        'Ebs': {
            'DeleteOnTermination': True,
            'VolumeSize': 32,
            'VolumeType': 'gp2',
        },
    }

    launch_config = {
        'BlockDeviceMappings': [root_volume],
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        'InstanceType': instance_type,
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['windows-dev-1'].id],
    }

    with create_temp_windows_ec2_instances(c, launch_config) as instances:
        # An empty target list makes the loop a no-op when antivirus stays on.
        antivirus_targets = instances if disable_antivirus else []
        for instance in antivirus_targets:
            run_powershell(
                instance.winrm_client,
                'Set-MpPreference -DisableRealtimeMonitoring $true')

        yield instances
General Comments 0
You need to be logged in to leave comments. Login now