From eaf5dafca91daa10b865944cc3ea217e6dd8090c Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Tue, 11 Jun 2024 09:27:09 -0400
Subject: [PATCH] feat(infra): add support for autogenerate CI runners

The CI job now provisions (or reuses) a Paperspace machine through
`tools/machines.py --ci-template`, exposes its `publicIp` and `id` as step
outputs, runs the regression tests over SSH against that address, and
deletes the machine by id afterwards, even when the tests fail.

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
---
 .github/workflows/ci.yml | 17 +++++++-----
 tools/machines.py        | 58 ++++++++++++++++++++++++++++------------
 tools/requirements.txt   |  1 +
 3 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5436930b..29265ad7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,15 +26,17 @@ jobs:
       - name: setup tooling
         run: |
           python -m pip install uv
-          uv pip install --system httpx
+          uv pip install --system httpx orjson
       - name: startup machine
-        run: python tools/machines.py --start ${{ secrets.PAPERSPACE_MACHINE_ID }} || true
+        id: paperspace-machine
+        run: |
+          python tools/machines.py --ci-template ${{secrets.PAPERSPACE_TEMPLATE_ID}} >> "$GITHUB_OUTPUT"
       - name: running regression tests (PR)
         uses: appleboy/ssh-action@029f5b4aeeeb58fdfe1410a5d17f967dacf36262 # ratchet:appleboy/ssh-action@v1.0.3
         if: github.event_name == 'pull_request'
         with:
-          host: ${{secrets.PAPERSPACE_HOST}}
-          username: ${{secrets.PAPERSPACE_USERNAME}}
+          host: ${{steps.paperspace-machine.outputs.publicIp}}
+          username: paperspace
           key: ${{secrets.PAPERSPACE_SSH_KEY}}
           port: ${{secrets.PAPERSPACE_PORT}}
           script: /nix/var/nix/profiles/default/bin/nix run github:aarnphm/dix#openllm-ci --refresh -- --pr ${{github.event.number}}
@@ -42,11 +44,14 @@ jobs:
         uses: appleboy/ssh-action@029f5b4aeeeb58fdfe1410a5d17f967dacf36262 # ratchet:appleboy/ssh-action@v1.0.3
         if: ${{ !github.event.repository.fork && github.event_name == 'push' }}
         with:
-          host: ${{secrets.PAPERSPACE_HOST}}
-          username: ${{secrets.PAPERSPACE_USERNAME}}
+          host: ${{steps.paperspace-machine.outputs.publicIp}}
+          username: paperspace
           key: ${{secrets.PAPERSPACE_SSH_KEY}}
           port: ${{secrets.PAPERSPACE_PORT}}
           script: /nix/var/nix/profiles/default/bin/nix run github:aarnphm/dix#openllm-ci --refresh -- --head
+      - name: shutdown machine
+        if: always() && steps.paperspace-machine.outputs.id != ''
+        run: python tools/machines.py --delete ${{ steps.paperspace-machine.outputs.id }}
   evergreen: # https://github.com/marketplace/actions/alls-green#why
     if: always()
     needs:
diff --git a/tools/machines.py b/tools/machines.py
index 02d60539..252ec28a 100644
--- a/tools/machines.py
+++ b/tools/machines.py
@@ -1,19 +1,15 @@
 from __future__ import annotations
 
-import httpx,os,dataclasses,logging,time,argparse,typing as t
+import httpx,os,dataclasses,datetime,time,argparse,typing as t
 
 if (ENV := os.getenv("PAPERSPACE_API_KEY")) is None: raise RuntimeError('This script requires setting "PAPERSPACE_API_KEY"')
 HEADERS = httpx.Headers({'Authorization': f'Bearer {ENV}', 'Accept': 'application/json'})
 API_URL = 'https://api.paperspace.com/v1'
 
-logging.basicConfig(level=logging.ERROR)
-logger = logging.getLogger(__name__)
-
 @dataclasses.dataclass
 class Machine:
   id: str
   inner: httpx.Client = dataclasses.field(default_factory=lambda: httpx.Client(headers=HEADERS, base_url=API_URL, timeout=60), repr=False)
-
   def close(self): self.inner.close()
   def __del__(self): self.close()
   def __enter__(self): return self
@@ -25,44 +21,72 @@ class Machine:
   def start(self) -> bool:
     response = self.inner.patch(f'/machines/{self.id}/start')
     if response.status_code == 400 or self.status == 'ready':
-      logger.error('machine is already running')
+      print('machine is already running')
       return False
-    elif response.status_code != 200:
-      logger.error('Error while starting machine "%s": %s', self.id, response.json())
+    elif response.status_code != 200: raise ValueError(f'Error while starting machine: {response.json()}')
     return True
   def stop(self) -> bool:
     response = self.inner.patch(f'/machines/{self.id}/stop')
     if response.status_code == 400 or self.status == 'off':
-      logger.error('machine is already off')
+      print('machine is already off')
       return False
-    elif response.status_code != 200:
-      logger.error('Error while stopping machine "%s": %s', self.id, response.json())
+    elif response.status_code != 200: raise ValueError(f'Error while stopping machine {response.json()}')
     return True
+  @classmethod
+  def ci(cls, template_id: str):
+    """Return the machine found by the 'openllm-ci' listing if there is exactly one, else create one from template_id."""
+    client = httpx.Client(headers=HEADERS, base_url=API_URL, timeout=60)
+    machines = client.get('/machines', params=dict(limit=1, name='openllm-ci')).json()
+    if len(machines['items']) == 1:
+      return cls(id=machines['items'][0]['id'], inner=client)
+    response = client.post('/machines', json=dict(
+      name=f'openllm-ci-{datetime.datetime.now().timestamp()}',
+      machineType='A100-80G', templateId=template_id,
+      networkId=os.getenv("PAPERSPACE_NETWORK_ID"),
+      diskSize=500, region='ny2', publicIpType="dynamic", startOnCreate=True,
+    ))
+    if response.status_code != 200: raise ValueError(f'Failed while creating a machine: {response.json()}')
+    return cls(id=response.json()['data']['id'], inner=client)
+  # key=value lines appended to $GITHUB_OUTPUT by CI; `id` is what `--delete` needs afterwards.
+  def actions(self): return f'publicIp={self.metadata["publicIp"]}\nid={self.id}'
 
 def main():
   parser = argparse.ArgumentParser()
   group = parser.add_mutually_exclusive_group(required=True)
   group.add_argument('--start', metavar='ID')
   group.add_argument('--stop', metavar='ID')
+  group.add_argument('--delete', metavar='ID')
+  group.add_argument('--ci-template', metavar='ID')
   args = parser.parse_args()
 
-  if args.start:
+  if args.ci_template:
+    machine = Machine.ci(args.ci_template)
+    while machine.status != 'ready': time.sleep(5)
+    print(machine.actions())
+    machine.close()
+  elif args.delete:
+    with httpx.Client(headers=HEADERS, base_url=API_URL, timeout=60) as client:
+      response = client.delete(f'/machines/{args.delete}')
+      if response.status_code != 200:
+        print(f'Error while deleting machine: {response.json()}')
+        return 1
+  elif args.start:
     with Machine(id=args.start) as machine:
       if machine.start():
         while machine.status != 'ready':
-          logger.info('Waiting for machine "%s" to be ready...', machine.id)
+          print(f'Waiting for machine "{machine.id}" to be ready...')
           time.sleep(5)
       else:
-        logger.error('Failed to start machine "%s"', machine.id)
+        print(f'Failed to start machine "{machine.id}"')
         return 1
   elif args.stop:
     with Machine(id=args.stop) as machine:
       if machine.stop():
-        while machine.status != 'ready':
-          logger.info('Waiting for machine "%s" to stop...', machine.id)
+        while machine.status != 'off':
+          print(f'Waiting for machine "{machine.id}" to stop...')
           time.sleep(5)
       else:
-        logger.error('Failed to stopmachine "%s"', machine.id)
+        print(f'Failed to stop machine "{machine.id}"')
         return 1
   return 0
 
diff --git a/tools/requirements.txt b/tools/requirements.txt
index 4c84a45c..b762f050 100644
--- a/tools/requirements.txt
+++ b/tools/requirements.txt
@@ -3,3 +3,4 @@ jupyter
 tomlkit
 ghapi
 pre-commit
+orjson