Fetching All Records from an OAI-PMH Repository Using Python
Here is a script for fetching all records from an OAI-PMH repository using Python. I hope it serves as a useful reference. import requests from requests import Request import xml.etree.ElementTree as ET # Define the endpoint base_url = 'https://curation.library.t.u-tokyo.ac.jp/oai' # Initial OAI-PMH request params = { 'verb': 'ListRecords', 'metadataPrefix': 'curation', 'set': '97590' } response = requests.get(base_url, params=params) # Prepare the initial request req = Request('GET', base_url,params=params) prepared_req = req.prepare() print("Sending request to:", prepared_req.url) # Output the URL root = ET.fromstring(response.content) data = [] # Fetch all data while True: # Process records for record in root.findall('.//{http://www.openarchives.org/OAI/2.0/}record'): identifier = record.find('.//{http://www.openarchives.org/OAI/2.0/}identifier').text print(f'Record ID: {identifier}') # Other data can be processed here as well data.append(record) # Get resumptionToken and execute next request token_element = root.find('.//{http://www.openarchives.org/OAI/2.0/}resumptionToken') if token_element is None or not token_element.text: break # End loop if no token params = { 'verb': 'ListRecords', 'resumptionToken': token_element.text } response = requests.get(base_url, params=params) root = ET.fromstring(response.content) print("All records have been fetched.") print(len(data))