I am interested in accessing the data of igdb.com for a hobby project. They have an API that is “free”, i.e. it requires a Twitch account with 2FA enabled, which means I would need to hand over my phone number to Twitch. That is a hard no from my side.

Does anyone know of a way to get a dump of their database, or a similar source where I could get API access without revealing personal information? For my project I am after only title, release year, some sort of rating and some sort of playtime.

    • BakedCatboy@lemmy.ml
      link
      fedilink
      English
      arrow-up
      4
      ·
      edit-2
      18 days ago

      You’re welcome! It was a fun hyperfixation project. I ended up making the script so easy to use I decided to just scrape every other endpoint too, so if anyone wants it, here’s a full dump of every endpoint, it’s only like 4x bigger:

      https://mega.nz/file/YF4F3bCS#pkS8Ki9QuucMGJF65YwGUE-NQZ78QEWs73fmF71qa18

      And if anyone wants to do their own scraping to get more up to date data later, just pip install:

      python-dotenv==1.2.2
      Requests==2.34.2
      tqdm==4.67.1
      

      Put API keys in .env or export env vars:

      CLIENT_ID=<client_id>
      # Provide to fetch new token
      CLIENT_SECRET=<client_secret>
      # Optional, provide to reuse existing access token, secret will not be used
      ACCESS_TOKEN=<access_token>
      

      And just run python dump.py games or any other endpoint in the api docs like release_dates etc. It outputs the json and a simple log to an output folder wherever you ran it. No error handling or checkpointing so if it fails partway through you don’t get anything, but I didn’t have a single error the whole time.

      usage: dump.py [-h] api_route
      
      IGDB Dump Script
      
      positional arguments:
        api_route   The API route to scrape, eg: games or game_time_to_beats
      
      options:
        -h, --help  show this help message and exit
      

      dump.py:

      import argparse
      import json
      import logging
      import os
      import pathlib
      import time
      from dotenv import dotenv_values
      import requests
      from tqdm import tqdm
      
      # Dump every record of one IGDB API route to OUT_DIR/<route>.json.
      #
      # Flow: read credentials -> parse the route argument -> set up logging ->
      # obtain (or reuse) an access token -> page through the endpoint ->
      # write the JSON file -> re-read it and log simple sanity statistics.

      # Number of records requested per API call (also the offset increment).
      API_PAGE_SIZE = 500
      # Directory (relative to the current working directory) for json + log output.
      OUT_DIR = "output"

      # Values from .env merged with the process environment; because os.environ
      # is unpacked last, exported variables override entries from .env.
      config = {
          **dotenv_values(".env"),
          **os.environ,
      }

      # Set up flags / args
      parser = argparse.ArgumentParser(
          prog="dump.py", description="IGDB Dump Script"
      )
      parser.add_argument(
          "api_route",
          help="The API route to scrape, eg: games or game_time_to_beats",
      )
      args = parser.parse_args()

      # Create out dir. mkdir must be called on a Path instance; calling the
      # unbound method with a plain string fails once the directory exists.
      pathlib.Path(OUT_DIR).mkdir(parents=False, exist_ok=True)

      # Set up logging to the route's file and to the console. The console
      # handler uses the tqdm class as its "stream" so messages go through
      # tqdm.write() and do not corrupt the progress bar; tqdm.write() appends
      # the line ending itself, hence the empty terminator.
      tqdmHandler = logging.StreamHandler(tqdm)
      tqdmHandler.terminator = ""
      logging.basicConfig(
          level=logging.INFO,
          format="%(message)s",
          handlers=[
              logging.FileHandler(f"{OUT_DIR}/{args.api_route}.log"),
              tqdmHandler,
          ],
      )

      # Check for existing json to prevent overwriting existing dumps
      outFile = f"{OUT_DIR}/{args.api_route}.json"
      if pathlib.Path(outFile).exists():
          print(f"Existing json found {outFile}, please move or remove it before proceeding")
          raise SystemExit(1)

      # Credentials are all optional at lookup time; .get() avoids a KeyError
      # when e.g. ACCESS_TOKEN is simply not provided.
      clientId = config.get('CLIENT_ID')
      clientSecret = config.get('CLIENT_SECRET')
      accessToken = config.get('ACCESS_TOKEN')

      if clientId and accessToken:
          logging.info("Using CLIENT_ID and existing ACCESS_TOKEN")
      elif clientId and clientSecret:
          logging.info("Fetching new access token...")
          response = requests.post(
              url="https://id.twitch.tv/oauth2/token",
              params={
                  "client_id": clientId,
                  "client_secret": clientSecret,
                  "grant_type": "client_credentials",
              },
              timeout=30,
          )
          # Fail loudly on a rejected credential instead of indexing an error body.
          response.raise_for_status()
          accessToken = response.json().get('access_token')
          config['ACCESS_TOKEN'] = accessToken
      else:
          logging.error("Missing CLIENT_ID and CLIENT_SECRET or ACCESS_TOKEN")
          raise SystemExit(1)

      # Re-check access token in case fetch failed
      if not (clientId and accessToken):
          logging.error("Client ID or Access Token not available")
          raise SystemExit(1)

      items = []
      offset = 0
      logging.info(f"Fetching batches of {API_PAGE_SIZE} on endpoint {args.api_route}")
      with tqdm() as pbar:
          while True:
              response = requests.post(
                  # The route must be interpolated; the URL-encoded braces in the
                  # earlier version sent every request to a non-existent path.
                  url=f"https://api.igdb.com/v4/{args.api_route}",
                  headers={
                      "Client-ID": clientId,
                      "Authorization": f"Bearer {accessToken}",
                  },
                  data=f"fields *; limit {API_PAGE_SIZE}; offset {offset};",
                  timeout=30,
              )
              # Abort on HTTP errors so an error payload is never mistaken
              # for a (short) page of results.
              response.raise_for_status()
              newItems = response.json()
              fetchCount = len(newItems)
              pbar.update(fetchCount)
              if fetchCount != API_PAGE_SIZE:
                  logging.info(f"WARN: Requested {API_PAGE_SIZE}, got {fetchCount}")
              offset += API_PAGE_SIZE
              items.extend(newItems)
              # A short page is treated as the end of the data set.
              if fetchCount < API_PAGE_SIZE:
                  logging.info("Received partial page, ending")
                  break
              # Pause between requests to stay gentle on the API.
              time.sleep(1)

      logging.info(f"Total fetched: {len(items)}")
      with open(outFile, "w", encoding="utf-8") as file:
          logging.info("Writing to json...")
          json.dump(items, file, ensure_ascii=False, indent=2)

      # Print some stats
      logging.info(f"\nChecking json output: {args.api_route}.json")

      # Re-read the file that was just written so the stats describe what is
      # actually on disk rather than the in-memory list.
      with open(outFile, "r", encoding="utf-8") as file:
          entries = json.load(file)

      logging.info(f"{len(entries)} entries in json")

      # Distinct ids; a count lower than the entry count indicates duplicates.
      uniqueIds = {entry['id'] for entry in entries}

      logging.info(f"{len(uniqueIds)} unique IDs in json")