name: ingest

on:
  # push: {}
  schedule:
    - cron: "45 */3 * * *"  # at minute 45 of every 3rd hour
  workflow_dispatch: {}
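  # Manual runs are also possible, e.g. with the GitHub CLI:
  #   gh workflow run ingest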

# At most one ingest run at a time; newly queued runs wait for the active one
concurrency: ingest

env:
  AWS_ROUTE53_HOSTED_ZONE_ID: Z25CPWZR4S9IHS
  AWS_ROUTE53_TTL: 60
  AWS_S3_BASE_URL: s3://datasets.kamu.dev/odf/v2/contrib
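
# Each matrix job pulls its dataset from the S3 replica, ingests new data from
# the upstream source, pushes the result back to S3, and, for datasets that
# define a dnslink_subdomain, publishes the new version to IPFS and points the
# DNSLink TXT record at the new CID.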
jobs:
  update:
    name: Update Dataset
    permissions:
      id-token: write  # required for OIDC federation with AWS
    strategy:
      fail-fast: false
      # TODO: We use DNSLink instead of IPNS due to IPNS's current usability issues like:
      # - frequent loss of writes
      # - lack of read-after-write consistency
      # - abysmal resolution times and frequent timeouts
      matrix:
        include:
          - dataset_name: com.cryptocompare.ohlcv.eth-usd
            dnslink_subdomain: _dnslink.com.cryptocompare.ohlcv.eth-usd.ipns.kamu.dev
          - dataset_name: co.alphavantage.tickers.daily.spy
            dnslink_subdomain: _dnslink.co.alphavantage.tickers.daily.spy.ipns.kamu.dev
          - dataset_name: net.rocketpool.reth.tokens-minted
            dnslink_subdomain: _dnslink.net.rocketpool.reth.tokens-minted.ipns.kamu.dev
          - dataset_name: net.rocketpool.reth.tokens-burned
            dnslink_subdomain: _dnslink.net.rocketpool.reth.tokens-burned.ipns.kamu.dev
          - dataset_name: com.github.kamu-data.clones
          - dataset_name: com.github.kamu-data.stargazers
          - dataset_name: com.github.kamu-data.views
          - dataset_name: com.youtube.channel.kamu-data.stats
          - dataset_name: com.youtube.channel.kamu-data.videos.stats
          - dataset_name: com.youtube.playlist.featuring-kamu-data.videos.stats
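          # Entries without a dnslink_subdomain are only synced to S3; the
          # IPFS pinning and DNSLink steps below are skipped for them.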
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
        with:
          path: kamu-contrib
      - uses: oduwsdl/setup-ipfs@main
        with:
          ipfs_version: ^0.29
          run_daemon: true  # pinning and swarm commands below need a live daemon
      - name: Install kamu
        shell: bash
        run: |
          curl -s "https://get.kamu.dev" | sh
      - name: Print info
        shell: bash
        run: |
          echo "Work dir: $PWD"
          echo "IPFS version:"
          ipfs version
          ipfs id
          echo "Podman version:"
          podman version
          echo "AWS CLI version:"
          aws --version
          echo "Kamu version:"
          kamu --version
      - uses: actions/cache@v4
        with:
          key: ingest-${{ matrix.dataset_name }}
          save-always: true
          path: |
            .kamu
      - uses: aws-actions/configure-aws-credentials@v4
        with:
          audience: sts.amazonaws.com
          aws-region: us-west-2
          role-to-assume: ${{ secrets.AWS_IAM_ROLE }}
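      # AWS credentials come from GitHub's OIDC token exchange (enabled by the
      # id-token: write permission above) instead of long-lived access keys.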
      - name: Configure
        shell: bash
        run: |
          # Setup kamu config
          echo "
          kind: CLIConfig
          version: 1
          content:
            source:
              ethereum:
                rpcEndpoints:
                  - chainId: 1
                    chainName: Ethereum Mainnet
                    nodeUrl: ${{ secrets.INFURA_ETH_MAINNET_WS }}
          " > .kamuconfig

          # Add pinning service
          ipfs pin remote service add pinata https://api.pinata.cloud/psa "${{ secrets.PINATA_ACCESS_TOKEN }}"

          # Create workspace
          kamu init --exists-ok
          kamu list --output-format table -w
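      # NOTE: --no-alias should keep the S3 URL from being recorded as a remote
      # alias for the dataset, since the remote is passed explicitly every run.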
      - name: Sync dataset from S3
        shell: bash
        run: |
          kamu -v pull "$AWS_S3_BASE_URL/${{ matrix.dataset_name }}" --as ${{ matrix.dataset_name }} --no-alias
          kamu list --output-format table -w
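      # NOTE: --fetch-uncacheable makes kamu re-query source data that is
      # marked uncacheable, so each scheduled run picks up fresh records.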
      - name: Ingest new data
        shell: bash
        env:
          ALPHA_VANTAGE_API_KEY: ${{ secrets.ALPHA_VANTAGE_API_KEY }}
          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
          GH_TOKEN: ${{ secrets.GITHUBSTATS_TOKEN }}
        run: |
          kamu -v pull --fetch-uncacheable ${{ matrix.dataset_name }}
          kamu list --output-format table -w
      - name: Publish dataset to S3
        shell: bash
        run: |
          kamu push ${{ matrix.dataset_name }} --to "$AWS_S3_BASE_URL/${{ matrix.dataset_name }}" --no-alias
      - name: Resolve current dataset CID
        id: resolve-dataset
        if: ${{ matrix.dnslink_subdomain }}
        shell: bash
        run: |
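          # The DNSLink TXT value has the form "dnslink=/ipfs/<cid>". The first
          # grep keeps the trailing run of non-'/' characters (the CID plus the
          # closing quote); the second grep strips the quote.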
          ipfs_cid=$( \
            aws route53 list-resource-record-sets \
              --hosted-zone-id $AWS_ROUTE53_HOSTED_ZONE_ID \
              | jq -r ".ResourceRecordSets[] | select(.Name == \"${{ matrix.dnslink_subdomain }}.\") | .ResourceRecords[0].Value" \
              | grep -oE '[^/]+"$' \
              | grep -oE '^[^"]+' \
          )
          echo "Current IPFS CID (from DNSLink): $ipfs_cid"
          echo "CID=$ipfs_cid" >> $GITHUB_OUTPUT
      - name: Pin dataset in Pinata
        if: ${{ matrix.dnslink_subdomain }}
        shell: bash
        env:
          OLD_CID: ${{ steps.resolve-dataset.outputs.CID }}
          PIN_TRIES: "10"
        run: |
          # Add dataset to IPFS
          ipfs_cid=$(kamu -v system ipfs add ${{ matrix.dataset_name }})
          echo "New IPFS CID: $ipfs_cid"
          if [ "$OLD_CID" == "$ipfs_cid" ]; then
            echo "CID didn't change - skipping publishing"
          else
            for i in $(seq 1 $PIN_TRIES); do
              echo "Pinning CID (attempt $i)"
              if ipfs pin remote add --service=pinata --name=${{ matrix.dataset_name }} $ipfs_cid; then
                echo "Successfully pinned"
                break
              else
                echo "Pinning service returned an error"
                # TODO: Remove this debug stuff when pinning is reliable
                peers_num=$(ipfs swarm peers | wc -l)
                echo "IPFS is connected to $peers_num peers"
                echo "IPFS addresses:"
                ipfs id
                if curl -s --fail "https://dweb.link/ipfs/$ipfs_cid" > /dev/null; then
                  echo "CID is RESOLVABLE via public gateway"
                else
                  echo "CID is NOT resolvable via public gateway"
                fi
                if [ $i == $PIN_TRIES ]; then
                  echo "Failed to pin CID after $PIN_TRIES attempts"
                  # TODO: Pinning fails so often that we have to ignore this for now
                  # not to obscure ingest errors.
                  exit 0
                fi
              fi
            done

            # Update DNSLink entry
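            # NOTE: Route 53 expects TXT record values to be wrapped in literal
            # double quotes, hence the escaped \" around dnslink=/ipfs/<cid>.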
echo "{ | |
\"Comment\": \"GHAction update of DNSLink record\", | |
\"Changes\": [ | |
{ | |
\"Action\": \"UPSERT\", | |
\"ResourceRecordSet\": { | |
\"Name\": \"${{ matrix.dnslink_subdomain }}.\", | |
\"Type\": \"TXT\", | |
\"TTL\": $AWS_ROUTE53_TTL, | |
\"ResourceRecords\": [{ | |
\"Value\": \"\\\"dnslink=/ipfs/$ipfs_cid\\\"\" | |
}] | |
} | |
} | |
] | |
}" > dns-upsert.json | |
echo "Updating DNSLink entry with: $(cat dns-upsert.json)" | |
aws route53 change-resource-record-sets --hosted-zone-id $AWS_ROUTE53_HOSTED_ZONE_ID --change-batch file://dns-upsert.json | |
fi | |
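      # Keep only the most recent Pinata pin for this dataset; pinata.py comes
      # from the kamu-contrib checkout in the first step.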
      - name: Cleanup old pins
        if: ${{ matrix.dnslink_subdomain }}
        shell: bash
        env:
          PINATA_ACCESS_TOKEN: ${{ secrets.PINATA_ACCESS_TOKEN }}
        run: |
          kamu-contrib/.ci/pinata.py unpin-old --name ${{ matrix.dataset_name }} --keep-latest 1