name: ingest
on:
  # push: {}
  schedule:
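    # GitHub cron runs in UTC: minute 45 of every 3rd hour.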
    - cron: "45 */3 * * *"
  workflow_dispatch: {}
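# Never run more than one instance of this workflow at a time.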
concurrency: ingest
env:
  AWS_ROUTE53_HOSTED_ZONE_ID: Z25CPWZR4S9IHS
  AWS_ROUTE53_TTL: 60
  AWS_S3_BASE_URL: s3://datasets.kamu.dev/odf/v2/contrib
jobs:
  update:
    name: Update Dataset
    permissions:
      id-token: write
    strategy:
      fail-fast: false
      # TODO: We use DNSLink instead of IPNS due to IPNS's current usability issues, such as:
      # - frequent loss of writes
      # - lack of read-after-write consistency
      # - abysmal resolution times and frequent timeouts
      matrix:
        include:
          - dataset_name: com.cryptocompare.ohlcv.eth-usd
            dnslink_subdomain: _dnslink.com.cryptocompare.ohlcv.eth-usd.ipns.kamu.dev
          - dataset_name: co.alphavantage.tickers.daily.spy
            dnslink_subdomain: _dnslink.co.alphavantage.tickers.daily.spy.ipns.kamu.dev
          - dataset_name: net.rocketpool.reth.tokens-minted
            dnslink_subdomain: _dnslink.net.rocketpool.reth.tokens-minted.ipns.kamu.dev
          - dataset_name: net.rocketpool.reth.tokens-burned
            dnslink_subdomain: _dnslink.net.rocketpool.reth.tokens-burned.ipns.kamu.dev
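          # The entries below have no dnslink_subdomain, so the IPFS pinning and
          # DNSLink steps (gated by their `if:` conditions) are skipped for them.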
          - dataset_name: com.github.kamu-data.clones
          - dataset_name: com.github.kamu-data.stargazers
          - dataset_name: com.github.kamu-data.views
          - dataset_name: com.youtube.channel.kamu-data.stats
          - dataset_name: com.youtube.channel.kamu-data.videos.stats
          - dataset_name: com.youtube.playlist.featuring-kamu-data.videos.stats
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
        with:
          path: kamu-contrib
      - uses: oduwsdl/setup-ipfs@main
        with:
          ipfs_version: ^0.29
          run_daemon: true
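      # Install the kamu CLI using its install script.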
      - name: Install kamu
        shell: bash
        run: |
          curl -s "https://get.kamu.dev" | sh
      - name: Print info
        shell: bash
        run: |
          echo "Work dir: $PWD"
          echo "IPFS version:"
          ipfs version
          ipfs id
          echo "Podman version:"
          podman version
          echo "AWS CLI version:"
          aws --version
          echo "Kamu version:"
          kamu --version
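      # Persist the local kamu workspace (.kamu) across runs, keyed per dataset,
      # so each run resumes from the previously ingested state.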
      - uses: actions/cache@v4
        with:
          key: ingest-${{ matrix.dataset_name }}
          save-always: true
          path: |
            .kamu
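      # Exchange the job's OIDC token (see `id-token: write` above) for
      # short-lived AWS credentials by assuming the configured IAM role.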
      - uses: aws-actions/configure-aws-credentials@v4
        with:
          audience: sts.amazonaws.com
          aws-region: us-west-2
          role-to-assume: ${{ secrets.AWS_IAM_ROLE }}
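      # Write a minimal kamu config (an Ethereum RPC endpoint for the on-chain
      # sources), register Pinata as a remote pinning service, and create the
      # workspace if it doesn't already exist.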
      - name: Configure
        shell: bash
        run: |
          # Setup kamu config
          echo "
          kind: CLIConfig
          version: 1
          content:
            source:
              ethereum:
                rpcEndpoints:
                  - chainId: 1
                    chainName: Ethereum Mainnet
                    nodeUrl: ${{ secrets.INFURA_ETH_MAINNET_WS }}
          " > .kamuconfig
          # Add pinning service
          ipfs pin remote service add pinata https://api.pinata.cloud/psa "${{ secrets.PINATA_ACCESS_TOKEN }}"
          # Create workspace
          kamu init --exists-ok
          kamu list --output-format table -w
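      # Pull the dataset's current state from the S3 repository; --no-alias
      # avoids saving the S3 URL as a remote alias in the workspace.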
      - name: Sync dataset from S3
        shell: bash
        run: |
          kamu -v pull "$AWS_S3_BASE_URL/${{ matrix.dataset_name }}" --as ${{ matrix.dataset_name }} --no-alias
          kamu list --output-format table -w
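      # Poll the external source for new data; --fetch-uncacheable pulls even
      # from data sources that are marked as uncacheable.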
      - name: Ingest new data
        shell: bash
        env:
          ALPHA_VANTAGE_API_KEY: ${{ secrets.ALPHA_VANTAGE_API_KEY }}
          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
          GH_TOKEN: ${{ secrets.GITHUBSTATS_TOKEN }}
        run: |
          kamu -v pull --fetch-uncacheable ${{ matrix.dataset_name }}
          kamu list --output-format table -w
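      # Push the updated dataset back to the S3 repository.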
      - name: Publish dataset to S3
        shell: bash
        run: |
          kamu push ${{ matrix.dataset_name }} --to "$AWS_S3_BASE_URL/${{ matrix.dataset_name }}" --no-alias
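      # Read the DNSLink TXT record from Route53 to find the currently published
      # CID; the record value has the form "dnslink=/ipfs/<cid>".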
      - name: Resolve current dataset CID
        id: resolve-dataset
        if: ${{ matrix.dnslink_subdomain }}
        shell: bash
        run: |
          ipfs_cid=$( \
            aws route53 list-resource-record-sets \
              --hosted-zone-id $AWS_ROUTE53_HOSTED_ZONE_ID \
              | jq -r ".ResourceRecordSets[] | select(.Name == \"${{ matrix.dnslink_subdomain }}.\") | .ResourceRecords[0].Value" \
              | grep -oE '[^/]+"$' \
              | grep -oE '^[^"]+'
          )
          echo "Current IPFS CID (from DNSLink): $ipfs_cid"
          echo "CID=$ipfs_cid" >> $GITHUB_OUTPUT
      - name: Pin dataset in Pinata
        if: ${{ matrix.dnslink_subdomain }}
        shell: bash
        env:
          OLD_CID: ${{ steps.resolve-dataset.outputs.CID }}
          PIN_TRIES: "10"
        run: |
          # Add dataset to IPFS
          ipfs_cid=$(kamu -v system ipfs add ${{ matrix.dataset_name }})
          echo "New IPFS CID: $ipfs_cid"
          if [ "$OLD_CID" == "$ipfs_cid" ]; then
            echo "CID didn't change - skipping publishing"
          else
            for i in $(seq 1 $PIN_TRIES); do
              echo "Pinning CID (attempt $i)"
              if ipfs pin remote add --service=pinata --name=${{ matrix.dataset_name }} $ipfs_cid; then
                echo "Successfully pinned"
                break
              else
                echo "Pinning service returned an error"
                # TODO: Remove this debug output once pinning is reliable
                peers_num=$(ipfs swarm peers | wc -l)
                echo "IPFS is connected to $peers_num peers"
                echo "IPFS addresses:"
                ipfs id
                if curl -s --fail "https://dweb.link/ipfs/$ipfs_cid" > /dev/null; then
                  echo "CID is RESOLVABLE via public gateway"
                else
                  echo "CID is NOT resolvable via public gateway"
                fi
                if [ $i == $PIN_TRIES ]; then
                  echo "Failed to pin CID after $PIN_TRIES attempts"
                  # TODO: Pinning fails so often that we have to ignore failures
                  # for now, so as not to obscure ingest errors.
                  exit 0
                fi
              fi
            done
            # Update DNSLink entry
            echo "{
              \"Comment\": \"GHAction update of DNSLink record\",
              \"Changes\": [
                {
                  \"Action\": \"UPSERT\",
                  \"ResourceRecordSet\": {
                    \"Name\": \"${{ matrix.dnslink_subdomain }}.\",
                    \"Type\": \"TXT\",
                    \"TTL\": $AWS_ROUTE53_TTL,
                    \"ResourceRecords\": [{
                      \"Value\": \"\\\"dnslink=/ipfs/$ipfs_cid\\\"\"
                    }]
                  }
                }
              ]
            }" > dns-upsert.json
            echo "Updating DNSLink entry with: $(cat dns-upsert.json)"
            aws route53 change-resource-record-sets --hosted-zone-id $AWS_ROUTE53_HOSTED_ZONE_ID --change-batch file://dns-upsert.json
          fi
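      # Unpin superseded CIDs from Pinata, keeping only the latest pin for this
      # dataset (uses the pinata.py helper script from this repo).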
      - name: Cleanup old pins
        if: ${{ matrix.dnslink_subdomain }}
        shell: bash
        env:
          PINATA_ACCESS_TOKEN: ${{ secrets.PINATA_ACCESS_TOKEN }}
        run: |
          kamu-contrib/.ci/pinata.py unpin-old --name ${{ matrix.dataset_name }} --keep-latest 1