
Version 0.30 - Memory consumption explodes after a number of hours #10526

Open

bgrahamen opened this issue Sep 26, 2024 · 25 comments
Labels: kind/bug (A bug in existing code, including security flaws) · need/maintainers-input (Needs input from the current maintainer(s)) · need/triage (Needs initial labeling and prioritization)

Comments

@bgrahamen commented Sep 26, 2024

Checklist

Installation method

ipfs-update or dist.ipfs.tech

Version

ipfs version 0.30.0

Config

{
  "API": {
    "HTTPHeaders": {
      "Access-Control-Allow-Credentials": [
        "true"
      ],
      "Access-Control-Allow-Methods": [
        "OPTIONS",
        "GET",
        "POST"
      ],
      "Access-Control-Allow-Origin": [
        "*"
      ]
    }
  },
  "Addresses": {
    "API": "/ip4/127.0.0.1/tcp/5001",
    "Announce": [
      "/ip4/<redacted>/tcp/4001",
      "/ip4/<redacted>/udp/4002/webrtc-direct",
      "/ip4/<redacted>/udp/4002/quic-v1",
      "/ip4/<redacted>/udp/4002/quic-v1/webtransport"
    ],
    "AppendAnnounce": null,
    "Gateway": "/ip4/0.0.0.0/tcp/8080",
    "NoAnnounce": [
      "/ip4/10.0.0.0/ipcidr/8",
      "/ip4/100.64.0.0/ipcidr/10",
      "/ip4/169.254.0.0/ipcidr/16",
      "/ip4/172.16.0.0/ipcidr/12",
      "/ip4/192.0.0.0/ipcidr/24",
      "/ip4/192.0.2.0/ipcidr/24",
      "/ip4/192.168.0.0/ipcidr/16",
      "/ip4/198.18.0.0/ipcidr/15",
      "/ip4/198.51.100.0/ipcidr/24",
      "/ip4/203.0.113.0/ipcidr/24",
      "/ip4/240.0.0.0/ipcidr/4",
      "/ip6/100::/ipcidr/64",
      "/ip6/2001:2::/ipcidr/48",
      "/ip6/2001:db8::/ipcidr/32",
      "/ip6/fc00::/ipcidr/7",
      "/ip6/fe80::/ipcidr/10"
    ],
    "Swarm": [
      "/ip4/0.0.0.0/tcp/4001",
      "/ip6/::/tcp/4001",
      "/ip4/0.0.0.0/udp/4002/webrtc-direct",
      "/ip4/0.0.0.0/udp/4002/quic-v1",
      "/ip4/0.0.0.0/udp/4002/quic-v1/webtransport",
      "/ip6/::/udp/4002/webrtc-direct",
      "/ip6/::/udp/4002/quic-v1",
      "/ip6/::/udp/4002/quic-v1/webtransport"
    ]
  },
  "AutoNAT": {},
  "Bootstrap": [
    "/dnsaddr/bootstrap.libp2p.io/p2p/QmNnooDu7bfjPFoTZYxMNLWUQJyrVwtbZg5gBMjTezGAJN",
    "/dnsaddr/bootstrap.libp2p.io/p2p/QmQCU2EcMqAqQPR2i9bChDtGNJchTbq5TbXJJ16u19uLTa",
    "/dnsaddr/bootstrap.libp2p.io/p2p/QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb",
    "/dnsaddr/bootstrap.libp2p.io/p2p/QmcZf59bWwK5XFi76CZX8cbJ4BhTzzA3gU1ZjYZcYW3dwt",
    "/ip4/<redacted>/tcp/4001/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ"
  ],
  "DNS": {
    "Resolvers": null
  },
  "Datastore": {
    "BloomFilterSize": 1048576,
    "GCPeriod": "30m",
    "HashOnRead": false,
    "Spec": {
      "mounts": [
        {
          "child": {
            "path": "blocks",
            "shardFunc": "/repo/flatfs/shard/v1/next-to-last/2",
            "sync": true,
            "type": "flatfs"
          },
          "mountpoint": "/blocks",
          "prefix": "flatfs.datastore",
          "type": "measure"
        },
        {
          "child": {
            "compression": "none",
            "path": "datastore",
            "type": "levelds"
          },
          "mountpoint": "/",
          "prefix": "leveldb.datastore",
          "type": "measure"
        }
      ],
      "type": "mount"
    },
    "StorageGCWatermark": 70,
    "StorageMax": "1500GB"
  },
  "Discovery": {
    "MDNS": {
      "Enabled": true
    }
  },
  "Experimental": {
    "FilestoreEnabled": false,
    "GraphsyncEnabled": false,
    "Libp2pStreamMounting": false,
    "P2pHttpProxy": false,
    "ShardingEnabled": false,
    "StrategicProviding": false,
    "UrlstoreEnabled": false
  },
  "ExperimentalLibp2pStreamMounting": false,
  "Gateway": {
    "APICommands": [],
    "HTTPHeaders": {
      "Access-Control-Allow-Methods": [
        "GET",
        "HEAD"
      ],
      "Access-Control-Allow-Origin": [
        "*"
      ]
    },
    "NoDNSLink": false,
    "NoFetch": false,
    "PathPrefixes": [],
    "PublicGateways": null,
    "RootRedirect": "",
    "Writable": false
  },
  "Identity": {
    "PeerID": "<redacted>"
  },
  "Internal": {},
  "Ipns": {
    "RecordLifetime": "",
    "RepublishPeriod": "",
    "ResolveCacheSize": 500
  },
  "Libp2pStreamMounting": false,
  "Migration": {
    "DownloadSources": null,
    "Keep": ""
  },
  "Mounts": {
    "FuseAllowOther": false,
    "IPFS": "/ipfs",
    "IPNS": "/ipns"
  },
  "Peering": {
    "Peers": [
      {
        "Addrs": [
          "/ip4/<redacted>/tcp/4001",
          "/ip4/<redacted>/udp/4001/quic-v1",
          "/ip4/<redacted>/udp/4001/quic-v1/webtransport"
        ],
        "ID": "<redacted>"
      }
    ]
  },
  "Pinning": {},
  "Plugins": {
    "Plugins": null
  },
  "Provider": {
    "Strategy": ""
  },
  "Pubsub": {
    "DisableSigning": false,
    "Router": ""
  },
  "Reprovider": {
    "Strategy": "pinned"
  },
  "Routing": {
    "AcceleratedDHTClient": true,
    "Methods": null,
    "Routers": null
  },
  "Swarm": {
    "AddrFilters": null,
    "ConnMgr": {},
    "DisableBandwidthMetrics": false,
    "DisableNatPortMap": false,
    "EnableAutoRelay": false,
    "EnableRelayHop": false,
    "RelayClient": {
      "Enabled": false
    },
    "RelayService": {
      "Enabled": true
    },
    "ResourceMgr": {
      "Enabled": false
    },
    "Transports": {
      "Multiplexers": {},
      "Network": {},
      "Security": {}
    }
  }
}

Description

I recently upgraded from 0.29 to 0.30 to address a memory leak I was experiencing.

Everything seems to go fine for the first 20 or so hours, but then there is an explosion of memory consumption around the 21st hour. After a restart of the process the behavior is the same, though the time window between stable memory consumption and the explosion is not predictable.

When the process first starts up it consumes approximately 27GB of memory. While it is in "steady state" the memory consumption grows by between 3GB and 7GB, at which point something gives way and the process shoots to 197GB of memory and continues to grow until it is OOM killed. The box this is running on has 256GB of RAM total.

Attached are the allocs, cpu, goroutines, and heap pprof files:
heap.tar.gz
goroutines.tar.gz
cpu.tar.gz
alloc.tar.gz

goroutine stacks file:
goroutines-stacks.tar.gz

Graph of the behavior (memory utilization over a 24-hour period after a process restart):
[image: memory utilization over 24 hours]
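For reference, a single ipfs diag profile run can usually capture all of the above in one archive; a minimal sketch (exact flags may vary by Kubo version, check ipfs diag profile --help):

# collect goroutine, heap, allocs and a CPU profile into one zip
ipfs diag profile --profile-time 30s --output ipfs-profile.zip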

@bgrahamen bgrahamen added kind/bug A bug in existing code (including security flaws) need/triage Needs initial labeling and prioritization labels Sep 26, 2024
@lidel (Member) commented Sep 27, 2024

@bgrahamen thank you for the report, we will triage next week, but a quick ask:
do you remember if this behavior started in 0.29, or was it also present in 0.28?

0.28 started prioritizing pin roots (release notes) and that introduced a slight memory increase when Routing.Strategy is set to pinned or all, which first announces pin roots and then announces the rest, while skipping CIDs that were already announced.

Would it be possible for you to switch Routing.Strategy to flat (which is equivalent to all from 0.27), restart, wait 24h, and then report back if it had any impact on memory?
This will be very helpful in triaging and in narrowing down the cause of the behavior you are experiencing.
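For reference, the concrete change would look something like this (the key is Reprovider.Strategy in current Kubo configs, as the reply below points out); a minimal sketch:

# switch the reprovide strategy, then restart the daemon and watch memory for ~24h
ipfs config Reprovider.Strategy flat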

@lidel lidel added the need/author-input Needs input from the original author label Sep 27, 2024
@bgrahamen (Author) commented Sep 27, 2024

This behavior was present in 0.29, but it was much worse with regard to the speed of consumption and the subsequent OOM kill + restart of the service.

For testing purposes I will see if I can change Reprovider.Strategy to flat for a time to check whether memory utilization changes. I will report back once that happens and I have some data.

That being said, we are not setting Routing.Strategy; we are setting Reprovider.Strategy, which is what I think you meant, @lidel?

@aschmahmann (Contributor) commented:

Otherwise our bandwidth utilization goes through the roof since we will route any CID, not just for the stuff we have pinned, I think.

That's not quite right. To make sure we're on the same page there are a few subsystems within kubo that may be relevant here:

  • Blockstore -> this is the place where all block data (i.e. the mapping of multihash -> bytes) is found
  • Garbage Collection -> this allows removal from the blockstore of any data that is no longer "needed"
    • Pinning -> this marks certain data as "please keep me around"; between pinning and MFS that covers all data that's protected from the garbage collector (if you're running it)
  • Providing -> this tells the network (e.g. the Amino DHT) that you have data, so other nodes looking for that data can come to you and ask for it. Of course, if they find you they can ask for it anyway
  • HTTP Gateway API -> answers queries for data and returns them according to the specification https://specs.ipfs.tech/http-gateways/

So by advertising unpinned data all you're doing is telling the network that you have data in your blockstore that you might garbage collect in the future, so the only way you're fetching "any CID" is if you're already fetching "any CID" for people via something like a publicly exposed HTTP Gateway API.
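In config terms, announcing only pinned content and keeping the gateway from fetching on demand come down to two settings that also appear later in this thread; a hedged sketch:

# announce only pinned CIDs to the DHT
ipfs config Reprovider.Strategy pinned
# serve only data already in the local blockstore over the HTTP gateway
ipfs config --json Gateway.NoFetch true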

@bgrahamen (Author) commented Sep 27, 2024

the only way you're fetching "any CID" is if you're already fetching "any CID" for people via them using something like a publicly exposed HTTP Gateway API.

This is exactly what happens in our setup today. We also intend to set Gateway.NoFetch.

But I need to resolve this memory consumption issue first, as that is presently a blocker.

@rotarur commented Sep 30, 2024

I've switched the Routing.Strategy to flat and will update you tomorrow on memory usage

@bgrahamen (Author) commented Sep 30, 2024

I would say, given how the memory consumption/growth looks so far, that setting flat for the Reprovider strategy made things much worse. The blue line is our alert limit of 126GB; the process is OOM killed around 212-225 GB.

Before change (24 hr period):
[image: memory utilization before the change]

After change (1 hr):
[image: memory utilization after the change]

Given the rate of memory consumption with flat in place, it looks like the process will be OOM killed in about 1/4 of the time.

@aschmahmann (Contributor) commented:

Could you post the profile information with the config changed as well?

@bgrahamen (Author) commented:

Could you post the profile information with the config changed as well?

Trying to do that now.

@bgrahamen (Author) commented:

We can't capture a pprof when Reprovider.Strategy is set to flat; the process simply consumes memory until it gets OOM killed. I can't get a successful ipfs diag profile to run at all without the process dying, even when I start it with only 20GB of memory used.

@aschmahmann (Contributor) commented:

Maybe you can grab a smaller slice of the info, like just the heap and memstats: https://github.com/ipfs/kubo/blob/master/docs/debug-guide.md

@bgrahamen (Author) commented Sep 30, 2024

latest_capture_with_flat_set.tar.gz

Attached are the outputs from these commands with flat set

goroutine dump:
curl localhost:5001/debug/pprof/goroutine\?debug=2 > ipfs.stacks

30 second cpu profile:
curl localhost:5001/debug/pprof/profile > ipfs.cpuprof

heap trace dump:
curl localhost:5001/debug/pprof/heap > ipfs.heap

memory statistics (in json, see "memstats" object):
curl localhost:5001/debug/vars > ipfs.vars

system information:
ipfs diag sys > ipfs.sysinfo
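Once captured, these profiles can be inspected with Go's standard pprof tooling; a minimal sketch, assuming a local Go toolchain:

# top in-use allocation sites from the heap profile
go tool pprof -top ipfs.heap
# interactive web UI (flame graph) for the same profile
go tool pprof -http=:8081 ipfs.heap
# same treatment for the 30-second CPU profile
go tool pprof -top ipfs.cpuprof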

@bgrahamen (Author) commented:

Another interesting note: after this last OOM-kill restart, memory consumption has remained stable, even though req/s, throughput, and all other traffic metrics are effectively the same between the periods of rapid memory consumption and OOM kills and periods like now, where usage is steady at 30GB and slowly growing (like before we set flat).
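One cheap way to tell whether the growth is Go heap or memory held elsewhere (e.g. by sockets or the OS) is to watch the memstats counters the daemon already exposes; a sketch, assuming jq is installed:

# compare Go heap usage against total memory obtained from the OS
curl -s localhost:5001/debug/vars | jq '.memstats | {HeapAlloc, HeapInuse, HeapIdle, Sys}'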

@lidel (Member) commented Oct 1, 2024

Triage notes

  • kubo gateway staging from ipshipyard has the Reprovider disabled (Interval set to 0) and it OOMs as well

    [image: staging gateway memory graph]

    • observation: Rainbow does not seem to OOM because it runs with a separate DHT node which gets shut down before problems occur (dht-shared-host=false)
  • quic-go seems to dominate the pprof view

@bgrahamen (Author) commented Oct 1, 2024

Given QUIC is the most popular transport, I wonder if we would be able to turn it off without impacting our services.

It looks like go-libp2p might pull in quic-go 0.47 in the next update (via go get) -- libp2p/go-libp2p#2901.

@lidel I compiled kubo against a local copy of go-libp2p v0.36.4 that was updated to use quic-go v0.47.0. If it would be helpful for your investigation I can provide that bin here, or push my modified docker image up to a repo for you to grab and test.

It seems to be running so far on my local setup, but my local setup is pretty much a test to see if it will even start.

Docker image with kubo compiled against go-libp2p v0.36.4 with quic-go updated to v0.47 -- https://hub.docker.com/repository/docker/bgrahamen/ipfs-mod
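For anyone wanting to reproduce a build like this, one plausible approach (a sketch, not necessarily the exact steps used here) is a go.mod replace pointing kubo at a local go-libp2p checkout with quic-go bumped:

# in a local go-libp2p v0.36.4 checkout: bump quic-go (compatibility assumed, not guaranteed)
cd go-libp2p
go get github.com/quic-go/quic-go@v0.47.0 && go mod tidy
cd ..

# in the kubo checkout: point go-libp2p at the local copy and rebuild
cd kubo
go mod edit -replace github.com/libp2p/go-libp2p=../go-libp2p
go mod tidy
make build   # binary lands in cmd/ipfs/ipfs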

@bgrahamen (Author) commented Oct 3, 2024

Quick update:

We will be testing the following changes to our config in an effort to see if we can resolve the issue:

ipfs config --json Swarm.DisableBandwidthMetrics true
ipfs config --json Swarm.ResourceMgr.Enabled true
ipfs config --json Swarm.ConnMgr.HighWater 4000
ipfs config --json Swarm.ConnMgr.LowWater 1000
ipfs config --json Swarm.ConnMgr.GracePeriod "30s"
ipfs config --json Swarm.ResourceMgr.MaxMemory "8589934592"
ipfs config --json Reprovider.Strategy "pinned"
ipfs config --json Gateway.NoFetch true

We are attempting to avoid having to turn off QUIC as a transport, or run the version compiled against an updated libp2p. The theory is that if the issue is QUIC connections hanging around for too long, reaping them and putting a limit on the memory used by libp2p might help.

These changes (with lower limits) are running on our test setup now and we are monitoring metrics for 24hrs before deploying them to our production setup.

Test Setup Limits:
ipfs config --json Swarm.ConnMgr.HighWater 1000
ipfs config --json Swarm.ConnMgr.LowWater 200
ipfs config --json Swarm.ResourceMgr.MaxMemory "943718400"
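For completeness, the fallback being avoided here (turning off QUIC-based transports) would look roughly like the following; treat the exact key names as assumptions to verify against the Kubo config docs for your version:

# disable QUIC and the transports layered on top of it, then restart the daemon
ipfs config --json Swarm.Transports.Network.QUIC false
ipfs config --json Swarm.Transports.Network.WebTransport false
ipfs config --json Swarm.Transports.Network.WebRTCDirect false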

@bgrahamen (Author) commented:

Additional side note: with the test-setup configuration in place I am seeing this warning on startup:

ERROR fullrtdht fullrt/dht.go:328 Accelerated DHT client was unable to fully refresh its routing table due to Resource Manager limits, which may degrade content routing. Consider increasing resource limits. See debug logs for the "dht-crawler" subsystem for details

I assume (per this issue) -- https://github.com/ipfs/kubo/issues/9405 -- that it is related to the number of connections I am setting in the high/low water mark config for ResourceManager.

Are there any known-good values for those high/low water marks that don't interfere with the Accelerated DHT client?
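One way to see whether it is the Resource Manager limits (rather than the connection manager watermarks) that the accelerated DHT client is hitting is to inspect the limits and usage the daemon exposes; a sketch, noting that command availability may vary by Kubo version:

# libp2p resource-manager limits and current usage
ipfs swarm resources
# resource-manager metrics are also exported on the API port
curl -s localhost:5001/debug/metrics/prometheus | grep libp2p_rcmgr | head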

@bgrahamen (Author) commented:

Update: We have made it just about 4.5 days since we put these changes in place and the process is still running. Memory is still growing slowly over time, but we have yet to see the runaway memory consumption effect. It will be interesting to see what happens when it reaches the memory limit (120GB).

If it is anything like our test setup, it will hopefully reclaim a bunch of memory when it nears that limit and then work its way back up to the limit again. This remains to be seen, but at present we are hopeful.

@lidel (Member) commented Oct 8, 2024

Thank you for testing quic-go v0.47.0 @bgrahamen.
Do you feel it translated to decreased memory utilization, or did you change other settings?
It would be good to see how things behave over a 48h+ window.

Sidenotes:

  • Swarm.ResourceMgr.MaxMemory "943718400" (900MiB) feels awfully low (see minimal reqs),
    • iiuc by default Kubo will set it to ~50% of your RAM, and that default is mostly there to avoid clogging desktop machines.
    • setting it to an extremely low ceiling (900MiB) is why you see the "Accelerated DHT client was unable to fully refresh its routing table due to Resource Manager limits" error.
    • at our infra, we set it to 85% of available RAM to utilize our boxes more fully; you may want to do something similar.
  • Swarm.ConnMgr (soft limits) aims to GC connections before Swarm.ResourceMgr (hard limits) are hit.
    • Are you increasing the soft ones for any reason? The current default is 32/96/20s, and you likely don't need to change it. Raising these will only increase your memory use.
    • Consider using the defaults. You should be able to run the accelerated client just fine with them.
      • Caveat: if you act as a stable DHT server (Routing.Type being auto + your node having a public addr) you will have around 1k connections anyway because of other peers connecting to you. You may set Routing.Type to autoclient to really cut down on incoming connections.
  • Noticed you run flatfs with "sync": true – you may switch it to false if you have many writes (avoiding blocking on a flush after every write). A consolidated sketch of these suggestions follows this list.
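A consolidated sketch of the suggestions above (the values are examples for a 256GB box, not prescriptions):

ipfs config Swarm.ResourceMgr.MaxMemory 217GB      # ~85% of available RAM
ipfs config --json Swarm.ConnMgr '{}'              # drop overrides, use the 32/96/20s defaults
ipfs config Routing.Type autoclient                # optional: stop serving the DHT to cut inbound connections
# flatfs "sync": false lives under Datastore.Spec and needs to be edited in the config file directly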

@bgrahamen (Author) commented:

@lidel -- On our test setup we have it set that low mostly to keep things as small as possible from an instance size/cost perspective. I will check to see if we can move things up to allow it at least 4GB or so of RAM.

I'm also not sure about the history of the connection limits we have set. I agree they are very high, but I haven't figured out why yet, so I can't adjust them down with confidence.

I will also look into the routing change.

We have not tested the version compiled against quic-go 0.47 yet. We are hoping to avoid that, via settings changes, until it can be pulled into go-libp2p officially. However, if needed, we will certainly try it.

@lidel lidel added the status/blocked Unable to be worked further until needs are met label Oct 15, 2024
@lidel (Member) commented Oct 15, 2024

Triage note: parking until Kubo 0.32.0-rc1 with latest quic-go is ready

@bgrahamen (Author) commented:

Perhaps some interesting additional data. Our changes (we have not touched connection limits yet) have brought a very marked increase in stability. However, we have noticed something else crop up, perhaps hidden before by all the memory-exhaustion issues.

I am not sure how the interaction plays out in kubo, but HTTP API requests also seem to increase memory usage?

We received a large spike of HTTP requests to our production IPFS in the two examples provided below. In both cases you can see the kubo process explode in terms of memory utilization. In the first example it died several times while under high memory pressure; I am still pulling logs to see whether it was an OOM kill or something else. The second time it was able to recover and then slowly start growing again (as it seems to do in our case).

Top is the HTTP request sum (per 15 min); bottom is kubo memory utilization.

[image: HTTP request volume]
[image: kubo memory utilization]

@lidel (Member) commented Oct 23, 2024

@bgrahamen spikes in gateway traffic likely translate to increased libp2p/bitswap/quic activity, so nothing unexpected (but it is an additional data point that more QUIC == more memory).

We've just shipped go-libp2p 0.37 with quic-go v0.48.1 and it got merged into Kubo's master branch.
Give it a try (either build yourself, or use ipfs/kubo:master-2024-10-22-56c68a1 docker image) and let us know if it had any meaningful impact.
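A sketch of pulling and running that image, following the usual kubo Docker invocation (the volume paths below are placeholders):

docker pull ipfs/kubo:master-2024-10-22-56c68a1
docker run -d --name ipfs_node \
  -v /path/to/ipfs/export:/export \
  -v /path/to/ipfs/data:/data/ipfs \
  -p 4001:4001 -p 4001:4001/udp \
  -p 127.0.0.1:8080:8080 -p 127.0.0.1:5001:5001 \
  ipfs/kubo:master-2024-10-22-56c68a1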

@gammazero gammazero added need/maintainers-input Needs input from the current maintainer(s) and removed need/author-input Needs input from the original author status/blocked Unable to be worked further until needs are met labels Nov 25, 2024
@gammazero (Contributor) commented:

Triage note: profile in the staging environment to see if this is quic or something else. If it is quic, open a go-libp2p issue if there is not one already.

@gammazero gammazero self-assigned this Nov 25, 2024
@bgrahamen (Author) commented:

We have upgraded to v0.32.1 and are monitoring memory consumption to see how the changes pulled in with the updated libp2p might impact things (hopefully in a positive way) for us.

@lidel (Member) commented Dec 6, 2024

We've confirmed it is still a problem in both v0.32.1 and latest master (433444b) which uses go-libp2p v0.37.2.

The team is still looking into this on our staging environment; it seems to be related to go-libp2p UDP transports not releasing memory correctly.

Hopefully we will have an upstream fix soon.
