1313from collections import defaultdict
1414
1515# Third-party
16+ from operator import itemgetter
1617import requests
1718from pygments import highlight
1819from pygments .formatters import TerminalFormatter
4344HEADER1_COUNT = ["TOOL IDENTIFIER" , "COUNT" ]
4445HEADER2_MEDIA = ["TOOL IDENTIFIER" , "MEDIA TYPE" , "COUNT" ]
4546HEADER3_RECORD = ["TOOL IDENTIFIER" , "RECORD TYPE" , "COUNT" ]
46- MAX_PER_PAGE = 100 # Pagination limit as defined by the API documentation
4747QUARTER = os .path .basename (PATHS ["data_quarter" ])
4848RECORD_TYPES = [
4949 "article" ,
@@ -69,6 +69,12 @@ def parse_arguments():
6969 action = "store_true" ,
7070 help = "Enable git actions (fetch, merge, add, commit, and push)" ,
7171 )
72+ parser .add_argument (
73+ "--limit" ,
74+ type = int ,
75+ default = None ,
76+ help = "Maximum number of records to fetch per each record type" ,
77+ )
7278 args = parser .parse_args ()
7379 if not args .enable_save and args .enable_git :
7480 parser .error ("--enable-git requires --enable-save" )
@@ -91,13 +97,6 @@ def get_requests_session():
9197 return session
9298
9399
94- def sanitize_string (s ):
95- """Replaces newline and carriage return characters with a space."""
96- if isinstance (s , str ):
97- return s .replace ("\n " , " " ).replace ("\r " , "" )
98- return s
99-
100-
101100def initialize_data_file (file_path , header ):
102101 if not os .path .isfile (file_path ):
103102 with open (file_path , "w" , encoding = "utf-8" , newline = "\n " ) as file_obj :
@@ -163,7 +162,7 @@ def write_counts_to_csv(args, data: dict):
163162 writer .writerows (rows )
164163
165164
166- def fetch_museums_victoria_data (session ):
165+ def fetch_museums_victoria_data (args , session ):
167166 """
168167 Fetches all records with images from the Museums Victoria API by iterating
169168 through all record types and handling pagination.
@@ -172,23 +171,33 @@ def fetch_museums_victoria_data(session):
172171 record_counts = defaultdict (lambda : defaultdict (int ))
173172 media_counts = defaultdict (lambda : defaultdict (int ))
174173 licences_count = defaultdict (int )
174+ total_records_processed = 0
175175
176176 # Iterate through each record type
177177 for record_type in RECORD_TYPES :
178178 current_page = 1
179179 total_pages = None
180+ per_page = 100
181+ if args .limit is not None :
182+ per_page = args .limit
183+ if total_records_processed >= args .limit :
184+ LOGGER .info (
185+ f"Limit Reached: { total_records_processed } processed. Skipping remaining record types."
186+ )
187+ break
180188
181- LOGGER .info (f"--- Starting fetch for: { record_type .upper ()} ---" )
182-
189+ LOGGER .info (
190+ f"fetching page { current_page } of { record_type } s "
191+ f"(records { (current_page * per_page ) - per_page } -"
192+ f"{ current_page * per_page } )"
193+ )
183194 while True :
184195 # 1. Construct the API query parameters
185196 params = {
186- "recordtype" : record_type ,
187- # "perpage": 20,
188- "perpage" : MAX_PER_PAGE ,
189- "page" : current_page ,
190- # "page": 1,
191197 "envelope" : "true" ,
198+ "page" : current_page ,
199+ "perpage" : per_page ,
200+ "recordtype" : record_type ,
192201 }
193202 try :
194203 r = session .get (BASE_URL , params = params , timeout = 30 )
@@ -202,6 +211,7 @@ def fetch_museums_victoria_data(session):
202211 data = r .json ()
203212 results = data .get ("response" , [])
204213 for res in results :
214+ total_records_processed += 1
205215 media_list = res .get ("media" , [])
206216 for media_item in media_list :
207217 licence_data = media_item .get ("licence" )
@@ -219,26 +229,38 @@ def fetch_museums_victoria_data(session):
219229 record_counts [record_type ][license_short_name ] += 1
220230 if total_pages is None :
221231 headers = data .get ("headers" , {})
222- # total_pages = 1
223232 total_pages = int (headers .get ("totalResults" , "0" ))
224233
234+ if args .limit is not None and total_records_processed >= per_page :
235+ break
225236 current_page += 1
237+
226238 if current_page > total_pages :
227239 break
240+
228241 return {
229- FILE1_COUNT : licences_count ,
230- FILE2_MEDIA : media_counts ,
231- FILE3_RECORD : record_counts ,
242+ FILE1_COUNT : dict ( sorted ( licences_count . items ())) ,
243+ FILE2_MEDIA : sort_nested_defaultdict ( media_counts ) ,
244+ FILE3_RECORD : sort_nested_defaultdict ( record_counts ) ,
232245 }
233246
def sort_nested_defaultdict(d):
    """
    Return a plain-dict copy of a (possibly nested) mapping with sorted keys.

    Any dict instance is rebuilt as a regular dict whose keys are inserted in
    sorted order, and its values are processed recursively. This includes
    defaultdict, which is a dict subclass, so a single isinstance check
    covers both cases. Any non-dict value is returned unchanged.
    """
    if not isinstance(d, dict):
        return d
    # Sorting the items orders them by key; dict keys are unique, so the
    # tuple comparison is always decided by the key and never reaches values.
    return {
        key: sort_nested_defaultdict(value)
        for key, value in sorted(d.items())
    }
254+
255+
234256
235257def main ():
236258 args = parse_arguments ()
237259 shared .paths_log (LOGGER , PATHS )
238260 shared .git_fetch_and_merge (args , PATHS ["repo" ])
239261 initialize_all_data_files (args )
240262 session = get_requests_session ()
241- data = fetch_museums_victoria_data (session )
263+ data = fetch_museums_victoria_data (args , session )
242264 write_counts_to_csv (args , data )
243265 args = shared .git_add_and_commit (
244266 args ,
0 commit comments