
Kubo stopped listing pins after some time #10596

Open
Mayeu opened this issue Nov 25, 2024 · 22 comments

@Mayeu

Mayeu commented Nov 25, 2024

Checklist

Installation method

built from source

Version

Kubo version: 0.32.1
Repo version: 16
System version: amd64/linux
Golang version: go1.23.3

Config

{
  "API": {
    "HTTPHeaders": {}
  },
  "Addresses": {
    "API": "/ip4/127.0.0.1/tcp/5001",
    "Announce": [],
    "AppendAnnounce": [],
    "Gateway": "/ip4/127.0.0.1/tcp/8080",
    "NoAnnounce": [
      "/ip4/10.0.0.0/ipcidr/8",
      "/ip4/100.64.0.0/ipcidr/10",
      "/ip4/169.254.0.0/ipcidr/16",
      "/ip4/172.16.0.0/ipcidr/12",
      "/ip4/192.0.0.0/ipcidr/24",
      "/ip4/192.0.2.0/ipcidr/24",
      "/ip4/192.168.0.0/ipcidr/16",
      "/ip4/198.18.0.0/ipcidr/15",
      "/ip4/198.51.100.0/ipcidr/24",
      "/ip4/203.0.113.0/ipcidr/24",
      "/ip4/240.0.0.0/ipcidr/4",
      "/ip6/100::/ipcidr/64",
      "/ip6/2001:2::/ipcidr/48",
      "/ip6/2001:db8::/ipcidr/32",
      "/ip6/fc00::/ipcidr/7",
      "/ip6/fe80::/ipcidr/10"
    ],
    "Swarm": [
      "/ip4/0.0.0.0/tcp/4001",
      "/ip6/::/tcp/4001",
      "/ip4/0.0.0.0/udp/4001/quic-v1",
      "/ip4/0.0.0.0/udp/4001/quic-v1/webtransport",
      "/ip4/0.0.0.0/udp/4001/webrtc-direct",
      "/ip6/::/udp/4001/quic-v1",
      "/ip6/::/udp/4001/quic-v1/webtransport",
      "/ip6/::/udp/4001/webrtc-direct"
    ]
  },
  "AutoNAT": {},
  "AutoTLS": {},
  "Bootstrap": [
    "/dnsaddr/bootstrap.libp2p.io/p2p/QmNnooDu7bfjPFoTZYxMNLWUQJyrVwtbZg5gBMjTezGAJN",
    "/dnsaddr/bootstrap.libp2p.io/p2p/QmQCU2EcMqAqQPR2i9bChDtGNJchTbq5TbXJJ16u19uLTa",
    "/dnsaddr/bootstrap.libp2p.io/p2p/QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb",
    "/dnsaddr/bootstrap.libp2p.io/p2p/QmcZf59bWwK5XFi76CZX8cbJ4BhTzzA3gU1ZjYZcYW3dwt",
    "/ip4/104.131.131.82/tcp/4001/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ",
    "/ip4/104.131.131.82/udp/4001/quic-v1/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ"
  ],
  "DNS": {
    "Resolvers": {}
  },
  "Datastore": {
    "BloomFilterSize": 0,
    "GCPeriod": "1h",
    "HashOnRead": false,
    "Spec": {
      "mounts": [
        {
          "child": {
            "path": "blocks",
            "shardFunc": "/repo/flatfs/shard/v1/next-to-last/3",
            "sync": false,
            "type": "flatfs"
          },
          "mountpoint": "/blocks",
          "prefix": "flatfs.datastore",
          "type": "measure"
        },
        {
          "child": {
            "compression": "none",
            "path": "datastore",
            "type": "levelds"
          },
          "mountpoint": "/",
          "prefix": "leveldb.datastore",
          "type": "measure"
        }
      ],
      "type": "mount"
    },
    "StorageGCWatermark": 90,
    "StorageMax": "31TB"
  },
  "Discovery": {
    "MDNS": {
      "Enabled": false
    }
  },
  "Experimental": {
    "FilestoreEnabled": false,
    "GraphsyncEnabled": false,
    "Libp2pStreamMounting": false,
    "OptimisticProvide": false,
    "OptimisticProvideJobsPoolSize": 0,
    "P2pHttpProxy": false,
    "StrategicProviding": false,
    "UrlstoreEnabled": false
  },
  "Gateway": {
    "APICommands": [],
    "DeserializedResponses": null,
    "DisableHTMLErrors": null,
    "ExposeRoutingAPI": null,
    "HTTPHeaders": {},
    "NoDNSLink": false,
    "NoFetch": false,
    "PathPrefixes": [],
    "PublicGateways": {
      "cfg": {
        "DeserializedResponses": null,
        "InlineDNSLink": null,
        "NoDNSLink": false,
        "Paths": null,
        "UseSubdomains": false
      },
      "localhost": {
        "DeserializedResponses": null,
        "InlineDNSLink": null,
        "NoDNSLink": false,
        "Paths": [
          "/ipfs"
        ],
        "UseSubdomains": false
      }
    },
    "RootRedirect": ""
  },
  "Identity": {
    "PeerID": "12D3KooWAJJJwXsB5b68cbq69KpXiKqQAgTKssg76heHkg6mo2qB"
  },
  "Import": {
    "CidVersion": null,
    "HashFunction": null,
    "UnixFSChunker": null,
    "UnixFSRawLeaves": null
  },
  "Internal": {
    "Bitswap": {
      "EngineBlockstoreWorkerCount": 128,
      "EngineTaskWorkerCount": 8,
      "MaxOutstandingBytesPerPeer": null,
      "ProviderSearchDelay": null,
      "TaskWorkerCount": 8
    }
  },
  "Ipns": {
    "RecordLifetime": "",
    "RepublishPeriod": "",
    "ResolveCacheSize": 128
  },
  "Migration": {
    "DownloadSources": [],
    "Keep": ""
  },
  "Mounts": {
    "FuseAllowOther": false,
    "IPFS": "/ipfs",
    "IPNS": "/ipns"
  },
  "Peering": {
    "Peers": "removed for brevity"
  },
  "Pinning": {},
  "Plugins": {
    "Plugins": null
  },
  "Provider": {
    "Strategy": ""
  },
  "Pubsub": {
    "DisableSigning": false,
    "Router": ""
  },
  "Reprovider": {
    "Interval": "24h0m0s",
    "Strategy": "all"
  },
  "Routing": {
    "AcceleratedDHTClient": true,
    "Methods": null,
    "Routers": null,
    "Type": "auto"
  },
  "Swarm": {
    "AddrFilters": [
      "/ip4/10.0.0.0/ipcidr/8",
      "/ip4/100.64.0.0/ipcidr/10",
      "/ip4/169.254.0.0/ipcidr/16",
      "/ip4/172.16.0.0/ipcidr/12",
      "/ip4/192.0.0.0/ipcidr/24",
      "/ip4/192.0.2.0/ipcidr/24",
      "/ip4/192.168.0.0/ipcidr/16",
      "/ip4/198.18.0.0/ipcidr/15",
      "/ip4/198.51.100.0/ipcidr/24",
      "/ip4/203.0.113.0/ipcidr/24",
      "/ip4/240.0.0.0/ipcidr/4",
      "/ip6/100::/ipcidr/64",
      "/ip6/2001:2::/ipcidr/48",
      "/ip6/2001:db8::/ipcidr/32",
      "/ip6/fc00::/ipcidr/7",
      "/ip6/fe80::/ipcidr/10"
    ],
    "ConnMgr": {
      "GracePeriod": "30s",
      "HighWater": 4096,
      "LowWater": 1024
    },
    "DisableBandwidthMetrics": false,
    "DisableNatPortMap": true,
    "RelayClient": {
      "Enabled": false
    },
    "RelayService": {
      "Enabled": false
    },
    "ResourceMgr": {
      "Limits": {},
      "MaxMemory": "24GB"
    },
    "Transports": {
      "Multiplexers": {},
      "Network": {},
      "Security": {}
    }
  },
  "Version": {}
}

Description

Hello,

We started experiencing an issue with Kubo in our 2-node cluster where Kubo no longer lists pins.

We have 2 nodes that each pin the entire pinset we keep track of, which is around 16.39 million pins right now.

In recent weeks (while we were still using 0.29), Kubo stopped responding to the /pin/ls queries sent by the cluster; those requests hung "indefinitely" (as in, when using curl I stopped the command after ~16h without a response). Our ipfs-cluster process logs the following when this happens:

Nov 24 22:16:34 ipfs-01 ipfs-cluster-service[3875697]: 2024-11-24T22:16:34.328Z        ERROR        pintracker        stateless/stateless.go:540        Post "http://127.0.0.1:5001/api/v0/pin/ls?stream=true&arg=QmcPpbdw8k8fyhas77WjzPMqSAgVYPonbVve3xTPtxL8ab&type=recursive": context canceled
Nov 24 22:16:34 ipfs-01 ipfs-cluster-service[3875697]: 2024-11-24T22:16:34.328Z        ERROR        cluster        ipfs-cluster/cluster.go:2022        12D3KooTheOtherClusterNodeHash: error in broadcast response from 12D3KooTheOtherClusterNodeHash: context canceled
Nov 24 22:16:34 ipfs-01 ipfs-cluster-service[3875697]: 2024-11-24T22:16:34.447Z        ERROR        ipfshttp        ipfshttp/ipfshttp.go:1276        error posting to IPFS:Post "http://127.0.0.1:5001/api/v0/pin/ls?stream=true&arg=QmRSGNoqrbTsKCgHbi8xCeQEn4sQXFDjfNEUgXWG5wAg6U&type=recursive": context canceled

This started out of the blue; there was no change on the server. The issue remained after upgrading to 0.32.1.

At that time we had the bloom filter activated, and deactivating it did improve the situation for a while (maybe 24h), but then the issue started to show up again. In retrospect, I think it may not be related to the bloom filter at all.

These are the typical metrics reported by ipfs-cluster, which show when Kubo stops responding to /pin/ls:

[Screenshot, 2024-11-25: cluster pin count (top) vs. pins reported by Kubo (bottom)]

The graph on top is the number of pins the cluster is keeping track of, and the one on the bottom is the number of pins reported by Kubo. When Kubo is restarted the count generally jumps back to the expected amount, and after a while it drops to 0. At that point any attempt to list pins from Kubo fails.

We only have the metrics reported by ipfs-cluster because of this Kubo bug.

Server CPU, RAM, and disk utilization are fairly low when this issue shows up, so it doesn't look like a performance issue. The only metric that goes out of bounds is the number of open file descriptors, which grew until it reached the 128k limit we had set. I bumped it to 1.28 million, but it still reaches that limit (with or without the bloom filter):
[Screenshot, 2024-11-25: open file descriptor count]

The FD limit is set both at the systemd unit level and via IPFS_FD_MAX.
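
For reference, a minimal sketch of how such limits can be raised for a systemd-managed Kubo; the unit name, drop-in contents, and numbers below are illustrative placeholders rather than our exact setup:

$ systemctl edit ipfs.service
# in the drop-in that opens, add:
#   [Service]
#   LimitNOFILE=1280000
#   Environment=IPFS_FD_MAX=1280000
$ systemctl restart ipfs.service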

Restarting Kubo makes it work again most of the time, but sometimes it doesn't change anything and it instantly starts to fail again.

Here is some profiling data from one of our nodes:

More info about the system:

  • NixOS, with the current version of Nixpkgs being the PR that updated Kubo to 0.32.1
  • AMD Ryzen 5 Pro 3600 - 6c/12t - 3.6 GHz/4.2 GHz
  • 128 GB ECC 2666 MHz
  • 2×512 GB SSD NVMe
    • one with the system
    • the other is split and used as log and cache devices for ZFS
  • one ZFS ZRAID-0 pool with 4×6 TB SATA HDDs

Kubo also emits a lot of:

INFO        net/identify        identify/id.go:457        failed negotiate identify protocol with peer        {"peer": "12D3KooWMyK8arvRjtC33rxTRZfKDcyQgZTC9yWpfMFHRbpngXwK", "error": "Application error 0x1 (remote): conn-31160279: system: cannot reserve inbound connection: resource limit exceeded"}

But ipfs swarm resources doesn't return anything above 5-15%, so I think this error is actually on the remote node's side and not related to our issue, right?

Anything else we could gather to help solve this issue?

Right now I'm out of ideas for getting our cluster back into a working state (besides restarting Kubo every 2h, but that's not a solution since it prevents us from reproviding the pins to the rest of the network).

Edit with additional info:

  • Kubo is launched without the --enable-gc flag, as prescribed by the ipfs-cluster documentation.
@Mayeu Mayeu added kind/bug A bug in existing code (including security flaws) need/triage Needs initial labeling and prioritization labels Nov 25, 2024
@11qu1d

11qu1d commented Nov 25, 2024

I can confirm we are seeing the same behaviour while testing the ipfs/kubo:v0.32.1 Docker image in k8s paired with ipfs-cluster (smaller test cluster, ~17k pins).

Logs from ipfs-cluster:

2024-11-25T15:01:17.201Z	ERROR	ipfshttp	ipfshttp/ipfshttp.go:1276	error posting to IPFS:Post "http://localhost:5001/api/v0/pin/ls?stream=true&type=recursive": context deadline exceeded
2024-11-25T15:01:17.201Z	ERROR	pintracker	stateless/stateless.go:364	could not get pinset from IPFS: Post "http://localhost:5001/api/v0/pin/ls?stream=true&type=recursive": context deadline exceeded
2024-11-25T15:01:17.201Z	ERROR	pintracker	stateless/stateless.go:570	could not get pinset from IPFS: Post "http://localhost:5001/api/v0/pin/ls?stream=true&type=recursive": context deadline exceeded

No errors or other logs observed in Kubo.

Also, calling the API endpoint or running ipfs pin ls --stream in Kubo just hangs until the pod is restarted; after a while it stops working again.

@hsanjuan hsanjuan self-assigned this Nov 25, 2024
@hsanjuan
Contributor

Hello, per the details you provide, I think leveldb has exploded in some way. How many files are there in the .ipfs/leveldb folder? Chances are it is compacting... or was compacting and got interrupted.

Do you have monitoring for disk-reads? Is it trying to read/write a lot from disk even when nothing is being added?

I would recommend switching leveldb to pebble (so, flatfs + pebble). You will lose the pinset (not the blocks, just the list of things you have pinned), but cluster will add the pins again in time.

@gammazero gammazero added need/author-input Needs input from the original author and removed need/triage Needs initial labeling and prioritization labels Dec 10, 2024
@Mayeu
Author

Mayeu commented Dec 12, 2024

How many files are there in the .ipfs/leveldb folder?

$ ls -l /ipfs-data/node/datastore/*.ldb | wc -l
1917

Do you have monitoring for disk-reads? Is it trying to read/write a lot from disk even when nothing is being added?

We are running Netdata on the node, so if you know of particular metrics that could be interesting I can take a look specifically at those.

In the meantime, this is the reported number of pins for the Kubo node between the 5th and 9th of December:
[Screenshot: pins reported by the Kubo node, December 5-9]

Disk bandwidth at that time:
[Screenshot: disk bandwidth over the same period]

and Disk utilization per systemd service:
[Screenshot: disk utilization per systemd service]

So there is less data read when Kubo is stuck, but the total number of read & write operations doesn't seem to be impacted.

I would recommend to switch leveldb to pebble (so, flatfs + pebble). You will loose the pinset (not the blocks, just the list of things you have pinned) but cluster will add the pins again, in time.

I was thinking about that, since I saw that pebble support was added to Kubo in a previous version.

Just to confirm the process, that would go like so:

  1. Stop Kubo
  2. Update the datastore configuration to replace the levelds store type with a pebble store (see the sketch after this list)
  3. Delete the previous datastore data
  4. Restart the node
  5. ipfs-cluster will then ask Kubo to repin everything, which won't download anything since the data is still on disk.
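
For illustration, a minimal sketch of what step 2 could look like in Datastore.Spec.mounts, swapping the levelds mount for a pebble one. The "pebbleds" type comes from Kubo's datastore documentation, but the path and prefix values here are assumptions to double-check, and the repo's datastore_spec file has to be edited to match:

{
  "child": {
    "path": "pebbleds",
    "type": "pebbleds"
  },
  "mountpoint": "/",
  "prefix": "pebble.datastore",
  "type": "measure"
}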

@hsanjuan
Contributor

Yes, when switching you will need to edit datastore_spec. You can also set pebble to live in a different folder (not the datastore folder used by leveldb). I would keep that folder around just in case you need to go back.

Do you use MFS for anything?

Also, regarding the ipfs-pins graph, it goes to 0 because of ipfs-cluster/ipfs-cluster#2122. From now on it will stay at the last reported amount from when pin ls worked.

Even if it won't need to download data, it will need to add 16M items to the datastore, and pinning will make it traverse everything it has.

@Mayeu
Author

Mayeu commented Dec 13, 2024

Yes, when switching you will need to edit datastore_spec. You can also set pebble to live in a different folder (not the datastore folder used by leveldb). I would keep that folder around just in case you need to go back.

Thank you, I'm going to try that today on one of our nodes.

Do you use MFS for anything?

No, we don't use MFS; we only add new pins via the cluster API, and when needed we access our data via Kubo's gateway using the CIDs. As far as I understand, this doesn't involve the MFS subsystem.

[...] the ipfs-pins graph, it goes to 0 because of ipfs-cluster/ipfs-cluster#2122. [...]

Good to know, thank you 👍

@Mayeu
Author

Mayeu commented Dec 13, 2024

I have switched one of our nodes to the pebble datastore; right now it is slowly adding the whole pinset back to pebble.

@ehsan6sha

pebble

Hi, did changing it to pebble solve the issue?

@lidel lidel added need/author-input Needs input from the original author and removed need/author-input Needs input from the original author labels Dec 17, 2024
@Mayeu
Author

Mayeu commented Dec 18, 2024

@ehsan6sha still too soon to tell. Our node is still adding the data back into the pebble store; it has only caught up on 50% of the previous pins so far.

github-actions bot commented

Oops, seems like we needed more information for this issue, please comment with more details or this issue will be closed in 7 days.

@lidel lidel removed need/author-input Needs input from the original author kind/stale labels Dec 25, 2024
@lidel
Member

lidel commented Dec 25, 2024

@Mayeu how does it look now? (assuming it finished re-pinning)

@lidel lidel added the need/author-input Needs input from the original author label Dec 25, 2024

github-actions bot commented Jan 1, 2025

Oops, seems like we needed more information for this issue, please comment with more details or this issue will be closed in 7 days.

@Mayeu
Author

Mayeu commented Jan 6, 2025

Hello, I'm just getting back to work after my end-of-year vacation, so first, happy new year to you all :)

On the matter, it seems that something unexpected happened during my break:

[Screenshot, 2025-01-06: pins reported by the pebble node over the break]

Around the 22nd of December, pinning stopped at ~14.18M (out of 16.51M at that point) and then dropped to 0 until yesterday, when it spiked back to the previous level and continued to pin. Right now there are still 0.7M pins that need to be added to the DB.

I'm currently going through our logs and I'll report back with what I can find there.

@Mayeu
Author

Mayeu commented Jan 7, 2025

Sadly, I can't find anything, because the volume of logs produced meant that the logs from before the spike had been purged 🤦🏻‍♀️

I'm updating our log retention config for that machine to keep much (much) more log history, and "hope" to see that drop again.

On the bright side, this node is now fully caught up with the cluster state, so we'll see if this issue shows up again. Meanwhile, our first node (which is still using LevelDB) hasn't experienced that issue since the 4th of December.

@hsanjuan
Contributor

hsanjuan commented Jan 7, 2025

As mentioned, you cannot trust the graph much due to the bug I pointed out above... you are better off tracking the "pending" items (queued, pinning, error) and comparing that to the total items in the cluster pinset, rather than using this metric right now.
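
For reference, a minimal sketch of pulling those numbers with ipfs-cluster-ctl; the filter values are assumptions to adapt to your setup:

# pins the cluster considers pending (queued, pinning, or in error)
$ ipfs-cluster-ctl status --filter queued,pinning,error
# total items in the cluster pinset, for comparison
$ ipfs-cluster-ctl pin ls | wc -l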

@hsanjuan
Contributor

hsanjuan commented Jan 7, 2025

(and happy new year!)

@gammazero
Contributor

@Mayeu We will wait for another week, and assume the issue is resolved if we do not hear from you.

@Mayeu
Author

Mayeu commented Jan 8, 2025

As mentioned, you cannot trust the graph much due to the bug I pointed out above... you are better off tracking the "pending" items (queued, pinning, error) and comparing that to the total items in the cluster pinset, rather than using this metric right now.

@hsanjuan right, I forgot about that. We do gather those as well.

Here they are for the past 60 days:

Pin queued:

[Screenshot: queued pins, past 60 days]

Error:

[Screenshot: pins in error state, past 60 days]

Pinning:

[Screenshot: pins in pinning state, past 60 days]

Just a reminder of the timeline:

  • We realized we were encountering the issue around the 15th of November, and started changing the node configurations and thus restarting the nodes regularly (one of those changes being lowering the number of concurrent pins from 100 to 50).
  • I switched our second node to using the pebble datastore on the 13th of December.

For comparison, here are the graphs for our first node (still using LevelDB), which didn't experience as many issues (it still encountered the listing issue, but for some reason it stabilized pretty quickly after we realized there was a problem):

Error and Pinning:

[Screenshot: error and pinning pins, first node]

Queued:

[Screenshot: queued pins, first node]

@hsanjuan
Contributor

hsanjuan commented Jan 8, 2025

So my understanding is that the new node still hangs on pin ls sometimes?

@Mayeu
Author

Mayeu commented Jan 8, 2025

@hsanjuan I can't go back in the logs before the 5th of January, so I'm not sure why the new node was stuck between the 22nd of December and the 5th of January.

Between the 5th and today there are 4184 pin/ls errors in the logs, the last ones being yesterday morning. So yes, it is still happening :/

@hsanjuan
Contributor

hsanjuan commented Jan 9, 2025

and the errors are context.Cancelled? How long does pin ls take when you call it manually?

@hsanjuan
Contributor

hsanjuan commented Jan 9, 2025

Also, assuming you call it manually and it is not streaming anything at all for a few minutes... can you run ipfs diag profile then and post the result? Or look yourself at which goroutine is hanging for that amount of time and which lock it is waiting on.

@Mayeu
Author

Mayeu commented Jan 9, 2025

and the errors are context.Cancelled?

Yes, the errors are the context.Cancelled ones.

How long does pin ls take when you call it manually ?

Previously, when the issue arose with LevelDB, it "never" finished ("never" as in I stopped curl after 24h). But I haven't yet been able to catch it myself with Pebble. I don't think it will "never" finish anymore, since it seems to happen regularly and then resolve itself, as this count of errors per day shows:

date              _count
--------------------------------
2025-01-07        3848
2025-01-08        463
2025-01-09        195

I'll script something to trigger a diagnostic if a curl doesn't finish after a few minutes, because I'm not sure I can react to those events myself.
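
A minimal sketch of such a watchdog (the 5-minute timeout and output path are arbitrary placeholders):

# if a streaming pin listing produces no output within 5 minutes,
# capture a diagnostic profile for later analysis
if ! timeout 300 ipfs pin ls --type=recursive --stream | head -c 1 | grep -q .; then
  ipfs diag profile --output "/tmp/kubo-profile-$(date +%s).zip"
fi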

FYI (and for my future self in 6 months), I'm getting those numbers with angle-grinder:

journalctl -u ipfs-cluster  --grep "ERROR.*ipfshttp.*/pin/ls.*context canceled" | agrind '* | parse "* * * * *[*]: *T*Z*" as month, day, time, server, process, pid, date, time, msg | count by date | sort by date'
