feat(infra): add support for autogenerate CI runners

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2024-06-11 09:27:09 -04:00
parent a5995a6bb8
commit eaf5dafca9
3 changed files with 49 additions and 22 deletions

View File

@@ -26,15 +26,17 @@ jobs:
- name: setup tooling
run: |
python -m pip install uv
uv pip install --system httpx
uv pip install --system httpx orjson
- name: startup machine
run: python tools/machines.py --start ${{ secrets.PAPERSPACE_MACHINE_ID }} || true
id: paperspace-machine
run: |
echo "$(python tools/machines.py --ci-template ${{secrets.PAPERSPACE_TEMPLATE_ID}})" >> $GITHUB_OUTPUT
- name: running regression tests (PR)
uses: appleboy/ssh-action@029f5b4aeeeb58fdfe1410a5d17f967dacf36262 # ratchet:appleboy/ssh-action@v1.0.3
if: github.event_name == 'pull_request'
with:
host: ${{secrets.PAPERSPACE_HOST}}
username: ${{secrets.PAPERSPACE_USERNAME}}
# Step outputs are addressed as steps.<step id>.outputs.<name>.
host: ${{steps.paperspace-machine.outputs.publicIp}}
username: paperspace
key: ${{secrets.PAPERSPACE_SSH_KEY}}
port: ${{secrets.PAPERSPACE_PORT}}
script: /nix/var/nix/profiles/default/bin/nix run github:aarnphm/dix#openllm-ci --refresh -- --pr ${{github.event.number}}
@@ -42,11 +44,13 @@ jobs:
uses: appleboy/ssh-action@029f5b4aeeeb58fdfe1410a5d17f967dacf36262 # ratchet:appleboy/ssh-action@v1.0.3
if: ${{ !github.event.repository.fork && github.event_name == 'push' }}
with:
host: ${{secrets.PAPERSPACE_HOST}}
username: ${{secrets.PAPERSPACE_USERNAME}}
# Step outputs are addressed as steps.<step id>.outputs.<name>; the startup step exports `publicIp`.
host: ${{steps.paperspace-machine.outputs.publicIp}}
username: paperspace
key: ${{secrets.PAPERSPACE_SSH_KEY}}
port: ${{secrets.PAPERSPACE_PORT}}
script: /nix/var/nix/profiles/default/bin/nix run github:aarnphm/dix#openllm-ci --refresh -- --head
- name: shutdown machine
  # NOTE(review): `--delete` issues DELETE /machines/<ID>, but the startup step
  # only exports `publicIp`; confirm a machine ID is exported before relying on this step.
  run: python tools/machines.py --delete ${{ steps.paperspace-machine.outputs.publicIp }}
evergreen: # https://github.com/marketplace/actions/alls-green#why
if: always()
needs:

View File

@@ -1,19 +1,15 @@
from __future__ import annotations
import httpx,os,dataclasses,logging,time,argparse,typing as t
import httpx,os,dataclasses,datetime,time,argparse,typing as t
# The API key is mandatory: abort at import time when it is not configured.
ENV = os.getenv("PAPERSPACE_API_KEY")
if ENV is None:
    raise RuntimeError('This script requires setting "PAPERSPACE_API_KEY"')

# Headers sent with every API request (bearer authentication, JSON responses).
HEADERS = httpx.Headers({'Authorization': f'Bearer {ENV}', 'Accept': 'application/json'})

# Base URL that all request paths in this module are relative to.
API_URL = 'https://api.paperspace.com/v1'
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)
@dataclasses.dataclass
class Machine:
id: str
inner: httpx.Client = dataclasses.field(default_factory=lambda: httpx.Client(headers=HEADERS, base_url=API_URL, timeout=60), repr=False)
def close(self):
    """Close the underlying HTTP client."""
    client = self.inner
    client.close()
def __del__(self): self.close()
def __enter__(self): return self
@@ -25,44 +21,70 @@ class Machine:
def start(self) -> bool:
    """Ask the API to power this machine on.

    Returns:
        ``True`` when the start request was accepted (HTTP 200).
        ``False`` when the API answered 400 or the machine already
        reports the ``'ready'`` status.

    Raises:
        ValueError: For any other non-200 response.
    """
    response = self.inner.patch(f'/machines/{self.id}/start')
    if response.status_code == 400 or self.status == 'ready':
        print('machine is already running')
        return False
    # A single failure branch: a duplicated `elif` here previously made the
    # raise unreachable, so failed requests were reported as success.
    if response.status_code != 200:
        raise ValueError(f'Error while starting machine: {response.json()}')
    return True
def stop(self) -> bool:
    """Ask the API to power this machine off.

    Returns:
        ``True`` when the stop request was accepted (HTTP 200).
        ``False`` when the API answered 400 or the machine already
        reports the ``'off'`` status.

    Raises:
        ValueError: For any other non-200 response.
    """
    response = self.inner.patch(f'/machines/{self.id}/stop')
    if response.status_code == 400 or self.status == 'off':
        print('machine is already off')
        return False
    # A single failure branch: a duplicated `elif` here previously made the
    # raise unreachable, so failed requests were reported as success.
    if response.status_code != 200:
        raise ValueError(f'Error while stopping machine {response.json()}')
    return True
@classmethod
def ci(cls, template_id: str) -> Machine:
    """Return a machine for CI, reusing a listed one or creating a new one.

    The API is first asked for at most one machine using the name filter
    ``openllm-ci``; if one comes back it is reused. Otherwise a new machine
    is created from ``template_id`` and started on creation.
    NOTE(review): created machines are named ``openllm-ci-<timestamp>``;
    confirm the ``name`` filter matches such names, otherwise every run
    creates a fresh machine.

    Args:
        template_id: Template ID passed as ``templateId`` on creation.

    Returns:
        A ``Machine`` that owns the HTTP client opened here.

    Raises:
        ValueError: If listing or creating machines does not return HTTP 200.
    """
    client = httpx.Client(headers=HEADERS, base_url=API_URL, timeout=60)
    try:
        listing = client.get('/machines', params=dict(limit=1, name='openllm-ci'))
        if listing.status_code != 200:
            raise ValueError(f'Failed while listing machines: {listing.json()}')
        items = listing.json()['items']
        if len(items) == 1:
            return cls(id=items[0]['id'], inner=client)
        response = client.post('/machines', json=dict(
            name=f'openllm-ci-{datetime.datetime.now().timestamp()}',
            machineType='A100-80G', templateId=template_id,
            networkId=os.getenv("PAPERSPACE_NETWORK_ID"),
            diskSize=500, region='ny2', publicIpType="dynamic", startOnCreate=True,
        ))
        if response.status_code != 200:
            raise ValueError(f'Failed while creating a machine: {response.json()}')
        return cls(id=response.json()['data']['id'], inner=client)
    except BaseException:
        # Ownership of the client only passes to the returned instance;
        # on any failure close it here so the connection pool is not leaked.
        client.close()
        raise
def actions(self):
    """Format the machine's public IP as a ``publicIp=<value>`` line."""
    public_ip = self.metadata["publicIp"]
    return f'publicIp={public_ip}'
def main() -> int:
    """Command-line entry point for managing machines.

    Exactly one of the following options is required:

    * ``--ci-template ID``: obtain a CI machine via ``Machine.ci``, wait
      until it is ready, and print its ``actions()`` line.
    * ``--delete ID``: delete the machine with the given ID.
    * ``--start ID``: start the machine and wait until it is ready.
    * ``--stop ID``: stop the machine and wait until it is off.

    Returns:
        ``0`` on success, ``1`` when the requested operation failed.
    """
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--start', metavar='ID')
    group.add_argument('--stop', metavar='ID')
    group.add_argument('--delete', metavar='ID')
    group.add_argument('--ci-template', metavar='ID')
    args = parser.parse_args()

    if args.ci_template:
        machine = Machine.ci(args.ci_template)
        try:
            while machine.status != 'ready':
                time.sleep(5)
            # Keep this the only stdout line on this path: it is a
            # `key=value` pair meant to be captured by the caller.
            print(machine.actions())
        finally:
            # Release the HTTP client even if polling raises.
            machine.close()
    elif args.delete:
        with httpx.Client(headers=HEADERS, base_url=API_URL, timeout=60) as client:
            response = client.delete(f'/machines/{args.delete}')
            if response.status_code != 200:
                print(f'Error while deleting machine: {response.json()}')
                return 1
    elif args.start:
        with Machine(id=args.start) as machine:
            if machine.start():
                while machine.status != 'ready':
                    print(f'Waiting for machine "{machine.id}" to be ready...')
                    time.sleep(5)
            else:
                print(f'Failed to start machine "{machine.id}"')
                return 1
    elif args.stop:
        with Machine(id=args.stop) as machine:
            if machine.stop():
                # Wait for the 'off' status; polling for 'ready' here would
                # never terminate once the machine has shut down.
                while machine.status != 'off':
                    print(f'Waiting for machine "{machine.id}" to stop...')
                    time.sleep(5)
            else:
                print(f'Failed to stop machine "{machine.id}"')
                return 1
    return 0

View File

@@ -3,3 +3,4 @@ jupyter
tomlkit
ghapi
pre-commit
orjson