##// END OF EJS Templates
automation: shore up rebooting behavior...
Gregory Szorc -
r42466:e570106b default
parent child Browse files
Show More
@@ -1,892 +1,908 b''
1 1 # aws.py - Automation code for Amazon Web Services
2 2 #
3 3 # Copyright 2019 Gregory Szorc <gregory.szorc@gmail.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 # no-check-code because Python 3 native.
9 9
10 10 import contextlib
11 11 import copy
12 12 import hashlib
13 13 import json
14 14 import os
15 15 import pathlib
16 16 import subprocess
17 17 import time
18 18
19 19 import boto3
20 20 import botocore.exceptions
21 21
22 22 from .winrm import (
23 23 run_powershell,
24 24 wait_for_winrm,
25 25 )
26 26
27 27
# Directory four levels above this file; used as the base for locating
# files elsewhere in the source tree.
SOURCE_ROOT = pathlib.Path(os.path.abspath(__file__)).parent.parent.parent.parent

# PowerShell script whose lines are appended to the bootstrap commands run
# when building the Windows development AMI.
INSTALL_WINDOWS_DEPENDENCIES = (SOURCE_ROOT / 'contrib' /
                                'install-windows-dependencies.ps1')


# Names (without the resource prefix) of the EC2 key pairs that
# ensure_key_pairs() creates when they are missing remotely.
KEY_PAIRS = {
    'automation',
}
37 37
38 38
# Security groups that ensure_security_groups() creates when missing, keyed
# by name without the resource prefix. ``description`` is passed as the
# group description and ``ingress`` is passed verbatim as ``IpPermissions``
# when authorizing ingress on a newly created group.
SECURITY_GROUPS = {
    'windows-dev-1': {
        'description': 'Mercurial Windows instances that perform build automation',
        'ingress': [
            # SSH.
            {
                'FromPort': 22,
                'ToPort': 22,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'SSH from entire Internet',
                    },
                ],
            },
            # RDP.
            {
                'FromPort': 3389,
                'ToPort': 3389,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'RDP from entire Internet',
                    },
                ],

            },
            # PowerShell Remoting / WinRM.
            {
                'FromPort': 5985,
                'ToPort': 5986,
                'IpProtocol': 'tcp',
                'IpRanges': [
                    {
                        'CidrIp': '0.0.0.0/0',
                        'Description': 'PowerShell Remoting (Windows Remote Management)',
                    },
                ],
            }
        ],
    },
}
80 80
81 81
# IAM roles that ensure_iam_state() creates when missing, keyed by name
# without the resource prefix. Each ARN in ``policy_arns`` is attached to
# the role right after it is created.
IAM_ROLES = {
    'ephemeral-ec2-role-1': {
        'description': 'Mercurial temporary EC2 instances',
        'policy_arns': [
            'arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM',
        ],
    },
}


# Trust policy passed as ``AssumeRolePolicyDocument`` when creating the roles
# above. It allows the ``ec2.amazonaws.com`` service to call
# ``sts:AssumeRole``.
ASSUME_ROLE_POLICY_DOCUMENT = '''
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
'''.strip()


# IAM instance profiles that ensure_iam_state() creates when missing, keyed
# by name without the resource prefix. ``roles`` lists the (unprefixed) roles
# each profile should contain; extra roles are removed, missing ones added.
IAM_INSTANCE_PROFILES = {
    'ephemeral-ec2-1': {
        'roles': [
            'ephemeral-ec2-role-1',
        ],
    }
}
115 115
116 116
# User Data for Windows EC2 instance. Mainly used to set the password
# and configure WinRM.
# Inspired by the User Data script used by Packer
# (from https://www.packer.io/intro/getting-started/build-image.html).
#
# The template contains a single ``%s`` placeholder, which
# create_temp_windows_ec2_instances() fills with the Administrator password.
# The text is also part of the AMI fingerprint computed in
# ensure_windows_dev_ami(), so editing it causes a new image to be built.
WINDOWS_USER_DATA = r'''
<powershell>

# TODO enable this once we figure out what is failing.
#$ErrorActionPreference = "stop"

# Set administrator password
net user Administrator "%s"
wmic useraccount where "name='Administrator'" set PasswordExpires=FALSE

# First, make sure WinRM can't be connected to
netsh advfirewall firewall set rule name="Windows Remote Management (HTTP-In)" new enable=yes action=block

# Delete any existing WinRM listeners
winrm delete winrm/config/listener?Address=*+Transport=HTTP 2>$Null
winrm delete winrm/config/listener?Address=*+Transport=HTTPS 2>$Null

# Create a new WinRM listener and configure
winrm create winrm/config/listener?Address=*+Transport=HTTP
winrm set winrm/config/winrs '@{MaxMemoryPerShellMB="0"}'
winrm set winrm/config '@{MaxTimeoutms="7200000"}'
winrm set winrm/config/service '@{AllowUnencrypted="true"}'
winrm set winrm/config/service '@{MaxConcurrentOperationsPerUser="12000"}'
winrm set winrm/config/service/auth '@{Basic="true"}'
winrm set winrm/config/client/auth '@{Basic="true"}'

# Configure UAC to allow privilege elevation in remote shells
$Key = 'HKLM:\SOFTWARE\Microsoft\Windows\CurrentVersion\Policies\System'
$Setting = 'LocalAccountTokenFilterPolicy'
Set-ItemProperty -Path $Key -Name $Setting -Value 1 -Force

# Configure and restart the WinRM Service; Enable the required firewall exception
Stop-Service -Name WinRM
Set-Service -Name WinRM -StartupType Automatic
netsh advfirewall firewall set rule name="Windows Remote Management (HTTP-In)" new action=allow localip=any remoteip=any
Start-Service -Name WinRM

# Disable firewall on private network interfaces so prompts don't appear.
Set-NetFirewallProfile -Name private -Enabled false
</powershell>
'''.lstrip()
162 162
163 163
# PowerShell run on first boot of the AMI build instance. It is split on
# newlines and sent through SSM's ``AWS-RunPowerShellScript`` document by
# ensure_windows_dev_ami(), before the instance is restarted. The text is
# also part of the AMI fingerprint, so editing it causes a new image to be
# built.
WINDOWS_BOOTSTRAP_POWERSHELL = '''
Write-Output "installing PowerShell dependencies"
Install-PackageProvider -Name NuGet -MinimumVersion 2.8.5.201 -Force
Set-PSRepository -Name PSGallery -InstallationPolicy Trusted
Install-Module -Name OpenSSHUtils -RequiredVersion 0.0.2.0

Write-Output "installing OpenSSL server"
Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0
# Various tools will attempt to use older versions of .NET. So we enable
# the feature that provides them so it doesn't have to be auto-enabled
# later.
Write-Output "enabling .NET Framework feature"
Install-WindowsFeature -Name Net-Framework-Core
'''
178 178
179 179
class AWSConnection:
    """Holds the boto3 session, clients and resources used to talk to AWS.

    Unless ``ensure_ec2_state`` is false, constructing an instance also
    reconciles the remote key pairs, security groups and IAM state with the
    definitions in this module.
    """

    def __init__(self, automation, region: str, ensure_ec2_state: bool=True):
        self.automation = automation
        self.local_state_path = automation.state_path

        self.prefix = 'hg-'

        # All clients and resources share one region-bound session.
        session = boto3.session.Session(region_name=region)
        self.session = session
        self.ec2client = session.client('ec2')
        self.ec2resource = session.resource('ec2')
        self.iamclient = session.client('iam')
        self.iamresource = session.resource('iam')
        self.security_groups = {}

        if not ensure_ec2_state:
            return

        ensure_key_pairs(automation.state_path, self.ec2resource)
        self.security_groups = ensure_security_groups(self.ec2resource)
        ensure_iam_state(self.iamclient, self.iamresource)

    def key_pair_path_private(self, name):
        """Path to the private key file of the key pair called ``name``."""
        return self.local_state_path.joinpath('keys', 'keypair-%s' % name)

    def key_pair_path_public(self, name):
        """Path to the public key file of the key pair called ``name``."""
        return self.local_state_path.joinpath('keys', 'keypair-%s.pub' % name)
207 207
208 208
def rsa_key_fingerprint(p: pathlib.Path):
    """Return the fingerprint of the RSA private key stored at ``p``.

    The key is converted to unencrypted PKCS#8 DER with ``openssl`` and the
    SHA-1 of that output is returned as colon-separated pairs of hex digits.
    ``subprocess.CalledProcessError`` is raised if ``openssl`` fails.
    """

    # TODO use rsa package.
    command = [
        'openssl', 'pkcs8', '-in', str(p), '-nocrypt', '-topk8',
        '-outform', 'DER',
    ]
    der = subprocess.run(command, capture_output=True, check=True).stdout

    digest = hashlib.sha1(der).hexdigest()
    return ':'.join(digest[i:i + 2] for i in range(0, len(digest), 2))
221 221
222 222
def ensure_key_pairs(state_path: pathlib.Path, ec2resource, prefix='hg-'):
    """Reconcile EC2 key pairs with the key files under ``state_path/keys``.

    Only remote key pairs whose name starts with ``prefix`` are considered.
    A key pair that exists on only one side, or whose fingerprints differ,
    is deleted wherever it exists. Afterwards every name in ``KEY_PAIRS``
    that is missing remotely is created and its private and public key
    files are written locally.
    """
    remote_existing = {
        kp.name[len(prefix):]: kp.key_fingerprint
        for kp in ec2resource.key_pairs.all()
        if kp.name.startswith(prefix)
    }

    # Validate that we have these keys locally.
    keys_dir = state_path / 'keys'
    keys_dir.mkdir(exist_ok=True, mode=0o700)

    def private_path(name):
        return keys_dir / ('keypair-%s' % name)

    def public_path(name):
        return keys_dir / ('keypair-%s.pub' % name)

    def remove_remote(actual):
        print('deleting key pair %s' % actual)
        ec2resource.KeyPair(actual).delete()

    def remove_local(name):
        for path in (public_path(name), private_path(name)):
            print('removing %s' % path)
            path.unlink()

    local_existing = {}

    for filename in sorted(os.listdir(keys_dir)):
        if not (filename.startswith('keypair-') and filename.endswith('.pub')):
            continue

        name = filename[len('keypair-'):-len('.pub')]

        pub_full = keys_dir / filename
        priv_full = private_path(name)

        if not pub_full.read_text(encoding='ascii').startswith('ssh-rsa '):
            print('unexpected format for key pair file: %s; removing' %
                  pub_full)
            pub_full.unlink()
            priv_full.unlink()
            continue

        local_existing[name] = rsa_key_fingerprint(priv_full)

    for name in sorted(set(remote_existing) | set(local_existing)):
        actual = '%s%s' % (prefix, name)

        if name not in local_existing:
            print('remote key %s does not exist locally' % name)
            remove_remote(actual)
            del remote_existing[name]

        elif name not in remote_existing:
            print('local key %s does not exist remotely' % name)
            remove_local(name)
            del local_existing[name]

        elif remote_existing[name] != local_existing[name]:
            print('key fingerprint mismatch for %s; '
                  'removing from local and remote' % name)
            remove_local(name)
            remove_remote(actual)
            del local_existing[name]
            del remote_existing[name]

    for name in sorted(KEY_PAIRS - set(remote_existing)):
        actual = '%s%s' % (prefix, name)
        print('creating key pair %s' % actual)

        priv_full = private_path(name)
        pub_full = public_path(name)

        created = ec2resource.create_key_pair(KeyName=actual)

        with priv_full.open('w', encoding='ascii') as fh:
            fh.write(created.key_material)
            fh.write('\n')

        priv_full.chmod(0o0600)

        # SSH public key can be extracted via `ssh-keygen`.
        with pub_full.open('w', encoding='ascii') as fh:
            subprocess.run(
                ['ssh-keygen', '-y', '-f', str(priv_full)],
                stdout=fh,
                check=True)

        pub_full.chmod(0o0600)
316 316
317 317
def delete_instance_profile(profile):
    """Remove every role from an IAM instance profile, then delete it."""
    for attached in profile.roles:
        role_name = attached.name
        print('removing role %s from instance profile %s' % (role_name,
                                                             profile.name))
        profile.remove_role(RoleName=role_name)

    print('deleting instance profile %s' % profile.name)
    profile.delete()
326 326
327 327
def ensure_iam_state(iamclient, iamresource, prefix='hg-'):
    """Ensure IAM state is in sync with our canonical definition.

    Only instance profiles and roles whose name starts with ``prefix`` are
    considered. Those not listed in ``IAM_INSTANCE_PROFILES`` / ``IAM_ROLES``
    are deleted, missing ones are created, and finally the role membership
    of every canonical instance profile is adjusted to match its definition.

    ``iamclient`` is used for waiters; ``iamresource`` for everything else.
    """

    remote_profiles = {}

    for profile in iamresource.instance_profiles.all():
        if profile.name.startswith(prefix):
            remote_profiles[profile.name[len(prefix):]] = profile

    for name in sorted(set(remote_profiles) - set(IAM_INSTANCE_PROFILES)):
        delete_instance_profile(remote_profiles[name])
        del remote_profiles[name]

    remote_roles = {}

    for role in iamresource.roles.all():
        if role.name.startswith(prefix):
            remote_roles[role.name[len(prefix):]] = role

    for name in sorted(set(remote_roles) - set(IAM_ROLES)):
        role = remote_roles[name]

        # IAM rejects deletion of a role that still has managed policies
        # attached, so detach them first (same as remove_resources()).
        # NOTE(review): a stale role that is still a member of a retained
        # instance profile would presumably still be rejected -- confirm.
        for policy in role.attached_policies.all():
            print('detaching policy %s from %s' % (policy.arn, role.name))
            role.detach_policy(PolicyArn=policy.arn)

        print('removing role %s' % role.name)
        role.delete()
        del remote_roles[name]

    # We've purged remote state that doesn't belong. Create missing
    # instance profiles and roles.
    for name in sorted(set(IAM_INSTANCE_PROFILES) - set(remote_profiles)):
        actual = '%s%s' % (prefix, name)
        print('creating IAM instance profile %s' % actual)

        profile = iamresource.create_instance_profile(
            InstanceProfileName=actual)
        remote_profiles[name] = profile

        # Creation is not instantaneous; wait until the profile is visible.
        waiter = iamclient.get_waiter('instance_profile_exists')
        waiter.wait(InstanceProfileName=actual)
        print('IAM instance profile %s is available' % actual)

    for name in sorted(set(IAM_ROLES) - set(remote_roles)):
        entry = IAM_ROLES[name]

        actual = '%s%s' % (prefix, name)
        print('creating IAM role %s' % actual)

        role = iamresource.create_role(
            RoleName=actual,
            Description=entry['description'],
            AssumeRolePolicyDocument=ASSUME_ROLE_POLICY_DOCUMENT,
        )

        waiter = iamclient.get_waiter('role_exists')
        waiter.wait(RoleName=actual)
        print('IAM role %s is available' % actual)

        remote_roles[name] = role

        for arn in entry['policy_arns']:
            print('attaching policy %s to %s' % (arn, role.name))
            role.attach_policy(PolicyArn=arn)

    # Now reconcile state of profiles.
    for name, meta in sorted(IAM_INSTANCE_PROFILES.items()):
        profile = remote_profiles[name]
        wanted = {'%s%s' % (prefix, role) for role in meta['roles']}
        have = {role.name for role in profile.roles}

        for role in sorted(have - wanted):
            print('removing role %s from %s' % (role, profile.name))
            profile.remove_role(RoleName=role)

        for role in sorted(wanted - have):
            print('adding role %s to %s' % (role, profile.name))
            profile.add_role(RoleName=role)
403 403
404 404
def find_windows_server_2019_image(ec2resource):
    """Return the Amazon-published Windows Server 2019 base image.

    The first image matching the filters is returned. A plain ``Exception``
    is raised when nothing matches.
    """
    criteria = (
        ('owner-alias', 'amazon'),
        ('state', 'available'),
        ('image-type', 'machine'),
        ('name', 'Windows_Server-2019-English-Full-Base-2019.02.13'),
    )
    filters = [{'Name': key, 'Values': [value]} for key, value in criteria]

    not_found = object()
    first = next(iter(ec2resource.images.filter(Filters=filters)), not_found)

    if first is not_found:
        raise Exception('unable to find Windows Server 2019 image')

    return first
432 432
433 433
def ensure_security_groups(ec2resource, prefix='hg-'):
    """Make the remote security groups match ``SECURITY_GROUPS``.

    Groups are named ``prefix`` plus the key in ``SECURITY_GROUPS``. Remote
    groups carrying the prefix that are not defined there are deleted, and
    defined groups that are missing are created with their ingress rules.
    Groups that already exist are reused as they are.

    Returns a dict mapping each unprefixed name to its security group.
    """
    existing = {
        found.group_name[len(prefix):]: found
        for found in ec2resource.security_groups.all()
        if found.group_name.startswith(prefix)
    }

    for name in sorted(set(existing) - set(SECURITY_GROUPS)):
        stale = existing[name]
        print('removing legacy security group: %s' % stale.group_name)
        stale.delete()

    security_groups = {}

    for name, spec in sorted(SECURITY_GROUPS.items()):
        if name not in existing:
            actual = '%s%s' % (prefix, name)
            print('adding security group %s' % actual)

            created = ec2resource.create_security_group(
                Description=spec['description'],
                GroupName=actual,
            )
            created.authorize_ingress(IpPermissions=spec['ingress'])

            security_groups[name] = created
        else:
            security_groups[name] = existing[name]

    return security_groups
475 475
476 476
def terminate_ec2_instances(ec2resource, prefix='hg-'):
    """Terminate every EC2 instance whose ``Name`` tag starts with ``prefix``.

    Instances already in the ``terminated`` state are skipped. The call
    returns once all instances it terminated have finished terminating.
    """
    pending = []

    for candidate in ec2resource.instances.all():
        if candidate.state['Name'] == 'terminated':
            continue

        for tag in candidate.tags or []:
            if tag['Key'] != 'Name' or not tag['Value'].startswith(prefix):
                continue

            print('terminating %s' % candidate.id)
            candidate.terminate()
            pending.append(candidate)

    for terminated in pending:
        terminated.wait_until_terminated()
493 493
494 494
def remove_resources(c, prefix='hg-'):
    """Delete every resource carrying ``prefix`` that this module manages.

    In order: EC2 instances are terminated, then our own AMIs, security
    groups, IAM instance profiles and IAM roles are removed. ``c`` must
    expose ``ec2resource`` and ``iamresource`` attributes.
    """
    ec2 = c.ec2resource
    iam = c.iamresource

    terminate_ec2_instances(ec2, prefix=prefix)

    for image in ec2.images.filter(Owners=['self']):
        if not image.name.startswith(prefix):
            continue
        remove_ami(ec2, image)

    for group in ec2.security_groups.all():
        if not group.group_name.startswith(prefix):
            continue
        print('removing security group %s' % group.group_name)
        group.delete()

    for profile in iam.instance_profiles.all():
        if not profile.name.startswith(prefix):
            continue
        delete_instance_profile(profile)

    for role in iam.roles.all():
        if not role.name.startswith(prefix):
            continue

        # Attached policies are detached before the role itself is deleted.
        for policy in role.attached_policies.all():
            print('detaching policy %s from %s' % (policy.arn, role.name))
            role.detach_policy(PolicyArn=policy.arn)

        print('removing role %s' % role.name)
        role.delete()
523 523
524 524
def wait_for_ip_addresses(instances):
    """Block until every instance in ``instances`` has a public IP address.

    Each instance is polled in turn: while it has no public address we sleep
    two seconds and reload it. There is no timeout.
    """
    for instance in instances:
        while not instance.public_ip_address:
            time.sleep(2)
            instance.reload()

        print('public IP address for %s: %s' % (
            instance.id, instance.public_ip_address))
537 537
538 538
def remove_ami(ec2resource, image):
    """Deregister an AMI, then delete the EBS snapshots it referenced."""
    # Resolve the snapshots before deregistering the image.
    snapshots = [
        ec2resource.Snapshot(mapping['Ebs']['SnapshotId'])
        for mapping in image.block_device_mappings
        if 'Ebs' in mapping
    ]

    print('deregistering %s' % image.id)
    image.deregister()

    for snap in snapshots:
        print('deleting snapshot %s' % snap.id)
        snap.delete()
553 553
554 554
def wait_for_ssm(ssmclient, instances):
    """Block until SSM reports information for every instance.

    ``instances`` must be a sized collection of objects with an ``id``
    attribute. SSM is polled every two seconds until the number of entries
    it returns equals ``len(instances)``. There is no timeout.
    """
    while True:
        id_filter = {
            'Key': 'InstanceIds',
            'Values': [instance.id for instance in instances],
        }
        response = ssmclient.describe_instance_information(Filters=[id_filter])

        available = len(response['InstanceInformationList'])
        wanted = len(instances)

        print('%d/%d instances available in SSM' % (available, wanted))

        if available != wanted:
            time.sleep(2)
            continue

        return
576 576
577 577
def run_ssm_command(ssmclient, instances, document_name, parameters):
    """Run an SSM document on instances and wait for it to succeed.

    The command is sent to all ``instances`` at once with CloudWatch output
    enabled. Each instance's invocation is then polled until its status is
    ``Success``. Any status other than ``Success``, ``Pending``,
    ``InProgress`` or ``Delayed`` raises ``Exception``.
    """
    sent = ssmclient.send_command(
        InstanceIds=[instance.id for instance in instances],
        DocumentName=document_name,
        Parameters=parameters,
        CloudWatchOutputConfig={
            'CloudWatchOutputEnabled': True,
        },
    )

    command_id = sent['Command']['CommandId']
    still_running = ('Pending', 'InProgress', 'Delayed')

    for instance in instances:
        while True:
            try:
                invocation = ssmclient.get_command_invocation(
                    CommandId=command_id,
                    InstanceId=instance.id,
                )
            except botocore.exceptions.ClientError as e:
                # The invocation may not be visible right after sending.
                if e.response['Error']['Code'] != 'InvocationDoesNotExist':
                    raise

                print('could not find SSM command invocation; waiting')
                time.sleep(1)
                continue

            status = invocation['Status']

            if status == 'Success':
                break

            if status not in still_running:
                raise Exception('command failed on %s: %s' % (
                    instance.id, status))

            time.sleep(2)
614 614
615 615
@contextlib.contextmanager
def temporary_ec2_instances(ec2resource, config):
    """Create EC2 instances that live only for the duration of the context.

    ``config`` is passed as keyword arguments to
    ``ec2resource.create_instances()``. The context manager yields the
    created instances. They may not be ready for work yet: waiting for them
    to respond is up to the caller.

    On exit, whether normal or via an exception, every created instance is
    asked to terminate. Termination is requested but not waited for.
    """
    launched = None
    ids = None

    try:
        launched = ec2resource.create_instances(**config)

        ids = [instance.id for instance in launched]
        print('started instances: %s' % ' '.join(ids))

        yield launched
    finally:
        # ``ids`` stays unset if creation itself failed.
        if ids:
            print('terminating instances: %s' % ' '.join(ids))
            for instance in launched:
                instance.terminate()
            print('terminated %d instances' % len(ids))
646 646
647 647
@contextlib.contextmanager
def create_temp_windows_ec2_instances(c: AWSConnection, config):
    """Create temporary Windows EC2 instances reachable over WinRM.

    This wraps ``temporary_ec2_instances()``. A deep copy of ``config`` is
    extended with the IAM instance profile, a ``Name`` tag and User Data
    that sets the Administrator password and configures Windows Remote
    Management. ``config`` itself must not contain ``IamInstanceProfile``
    or ``UserData``; ``ValueError`` is raised if it does.

    Each yielded instance has a ``winrm_client`` attribute holding the
    client returned by ``wait_for_winrm()`` for that instance.
    """
    for reserved in ('IamInstanceProfile', 'UserData'):
        if reserved in config:
            raise ValueError('%s cannot be provided in config' % reserved)

    password = c.automation.default_password()

    # Work on a copy so the caller's dict is left untouched.
    config = copy.deepcopy(config)
    config['IamInstanceProfile'] = {
        'Name': 'hg-ephemeral-ec2-1',
    }
    tag_specifications = config.setdefault('TagSpecifications', [])
    tag_specifications.append({
        'ResourceType': 'instance',
        'Tags': [{'Key': 'Name', 'Value': 'hg-temp-windows'}],
    })
    config['UserData'] = WINDOWS_USER_DATA % password

    with temporary_ec2_instances(c.ec2resource, config) as instances:
        wait_for_ip_addresses(instances)

        print('waiting for Windows Remote Management service...')

        for instance in instances:
            instance.winrm_client = None if False else wait_for_winrm(
                instance.public_ip_address, 'Administrator', password)
            print('established WinRM connection to %s' % instance.id)

        yield instances
685 685
686 686
def _stop_instance_and_wait(ec2client, instance):
    """Stop ``instance`` and block until EC2 reports it as stopped."""
    instance.stop()
    ec2client.get_waiter('instance_stopped').wait(
        InstanceIds=[instance.id],
        WaiterConfig={
            'Delay': 5,
        })


def ensure_windows_dev_ami(c: AWSConnection, prefix='hg-'):
    """Ensure Windows Development AMI is available and up-to-date.

    A fingerprint is computed from the instance config, the User Data, the
    initial bootstrap script and the bootstrap commands. An existing AMI
    named ``<prefix>windows-dev`` and owned by this account is reused when
    its ``HGIMAGEFINGERPRINT`` tag equals that fingerprint. AMIs with that
    name that lack tags or carry a different fingerprint are removed.

    If no suitable AMI exists, one is built by starting a temporary EC2
    instance, bootstrapping it, stopping it and creating an image from it.

    Returns an ``ec2.Image`` of either an existing AMI or a newly-built
    one.
    """
    ec2client = c.ec2client
    ec2resource = c.ec2resource
    ssmclient = c.session.client('ssm')

    name = '%s%s' % (prefix, 'windows-dev')

    config = {
        'BlockDeviceMappings': [
            {
                'DeviceName': '/dev/sda1',
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': 32,
                    'VolumeType': 'gp2',
                },
            }
        ],
        'ImageId': find_windows_server_2019_image(ec2resource).id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        'InstanceType': 't3.medium',
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['windows-dev-1'].id],
    }

    commands = [
        # Need to start the service so sshd_config is generated.
        'Start-Service sshd',
        'Write-Output "modifying sshd_config"',
        r'$content = Get-Content C:\ProgramData\ssh\sshd_config',
        '$content = $content -replace "Match Group administrators","" -replace "AuthorizedKeysFile __PROGRAMDATA__/ssh/administrators_authorized_keys",""',
        r'$content | Set-Content C:\ProgramData\ssh\sshd_config',
        'Import-Module OpenSSHUtils',
        r'Repair-SshdConfigPermission C:\ProgramData\ssh\sshd_config -Confirm:$false',
        'Restart-Service sshd',
        'Write-Output "installing OpenSSL client"',
        'Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0',
        'Set-Service -Name sshd -StartupType "Automatic"',
        'Write-Output "OpenSSH server running"',
    ]

    with INSTALL_WINDOWS_DEPENDENCIES.open('r', encoding='utf-8') as fh:
        commands.extend(l.rstrip() for l in fh)

    # Disable Windows Defender when bootstrapping because it just slows
    # things down.
    commands.insert(0, 'Set-MpPreference -DisableRealtimeMonitoring $true')
    commands.append('Set-MpPreference -DisableRealtimeMonitoring $false')

    # Compute a deterministic fingerprint to determine whether image needs
    # to be regenerated.
    fingerprint = {
        'instance_config': config,
        'user_data': WINDOWS_USER_DATA,
        'initial_bootstrap': WINDOWS_BOOTSTRAP_POWERSHELL,
        'bootstrap_commands': commands,
    }

    fingerprint = json.dumps(fingerprint, sort_keys=True)
    fingerprint = hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()

    # Find existing AMIs with this name and delete the ones that are invalid.
    # Store a reference to a good image so it can be returned once the
    # image state is reconciled.
    #
    # The lookup is limited to images we own: AMI names are not unique
    # across accounts, and we must never try to reuse or deregister
    # somebody else's image that happens to share our name.
    images = ec2resource.images.filter(
        Owners=['self'],
        Filters=[{'Name': 'name', 'Values': [name]}])

    existing_image = None

    for image in images:
        if image.tags is None:
            print('image %s for %s lacks required tags; removing' % (
                image.id, image.name))
            remove_ami(ec2resource, image)
        else:
            tags = {t['Key']: t['Value'] for t in image.tags}

            if tags.get('HGIMAGEFINGERPRINT') == fingerprint:
                existing_image = image
            else:
                print('image %s for %s has wrong fingerprint; removing' % (
                    image.id, image.name))
                remove_ami(ec2resource, image)

    if existing_image:
        return existing_image

    print('no suitable Windows development image found; creating one...')

    with create_temp_windows_ec2_instances(c, config) as instances:
        assert len(instances) == 1
        instance = instances[0]

        wait_for_ssm(ssmclient, [instance])

        # On first boot, install various Windows updates.
        # We would ideally use PowerShell Remoting for this. However, there are
        # trust issues that make it difficult to invoke Windows Update
        # remotely. So we use SSM, which has a mechanism for running Windows
        # Update.
        print('installing Windows features...')
        run_ssm_command(
            ssmclient,
            [instance],
            'AWS-RunPowerShellScript',
            {
                'commands': WINDOWS_BOOTSTRAP_POWERSHELL.split('\n'),
            },
        )

        # Reboot so all updates are fully applied.
        #
        # We don't use instance.reboot() here because it is asynchronous and
        # we don't know when exactly the instance has rebooted. It could take
        # a while to stop and we may start trying to interact with the instance
        # before it has rebooted.
        print('rebooting instance %s' % instance.id)
        _stop_instance_and_wait(ec2client, instance)

        instance.start()
        wait_for_ip_addresses([instance])

        # There is a race condition here between the User Data PS script running
        # and us connecting to WinRM. This can manifest as
        # "AuthorizationManager check failed" failures during run_powershell().
        # TODO figure out a workaround.

        print('waiting for Windows Remote Management to come back...')
        client = wait_for_winrm(instance.public_ip_address, 'Administrator',
                                c.automation.default_password())
        print('established WinRM connection to %s' % instance.id)
        instance.winrm_client = client

        print('bootstrapping instance...')
        run_powershell(instance.winrm_client, '\n'.join(commands))

        print('bootstrap completed; stopping %s to create image' % instance.id)
        _stop_instance_and_wait(ec2client, instance)
        print('%s is stopped' % instance.id)

        image = instance.create_image(
            Name=name,
            Description='Mercurial Windows development environment',
        )

        # Record the fingerprint so later runs can recognise this image.
        image.create_tags(Tags=[
            {
                'Key': 'HGIMAGEFINGERPRINT',
                'Value': fingerprint,
            },
        ])

        print('waiting for image %s' % image.id)

        ec2client.get_waiter('image_available').wait(
            ImageIds=[image.id],
        )

        print('image %s available as %s' % (image.id, image.name))

        return image
856 872
857 873
@contextlib.contextmanager
def temporary_windows_dev_instances(c: AWSConnection, image, instance_type,
                                    prefix='hg-', disable_antivirus=False):
    """Create a temporary Windows development EC2 instance.

    A single instance of ``instance_type`` is launched from ``image`` using
    the ``<prefix>automation`` key pair and the ``windows-dev-1`` security
    group. When ``disable_antivirus`` is true, Windows Defender real-time
    monitoring is turned off on it before it is handed to the caller.

    Context manager resolves to the list of ``EC2.Instance`` that were created.
    """
    root_volume = {
        'DeviceName': '/dev/sda1',
        'Ebs': {
            'DeleteOnTermination': True,
            'VolumeSize': 32,
            'VolumeType': 'gp2',
        },
    }

    config = {
        'BlockDeviceMappings': [root_volume],
        'ImageId': image.id,
        'InstanceInitiatedShutdownBehavior': 'stop',
        'InstanceType': instance_type,
        'KeyName': '%sautomation' % prefix,
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroupIds': [c.security_groups['windows-dev-1'].id],
    }

    with create_temp_windows_ec2_instances(c, config) as instances:
        for instance in (instances if disable_antivirus else ()):
            run_powershell(
                instance.winrm_client,
                'Set-MpPreference -DisableRealtimeMonitoring $true')

        yield instances
General Comments 0
You need to be logged in to leave comments. Login now