33import pandas as pd
44import jwt
55import re
6+ import logging
67
78
89class Gen3MetadataParser :
910 """
1011 A class to interact with Gen3 metadata API for fetching and processing data.
1112 """
1213
def __init__(self, key_file_path, logger=None):
    """
    Set up an unauthenticated parser bound to a key file.

    Args:
        key_file_path (str): The file path to the JSON key file for authentication.
        logger (logging.Logger, optional): Logger instance to report through.
            When None, the logger named "gen3_metadata" is used.
    """
    # Fall back to the shared "gen3_metadata" logger when none is supplied.
    self.logger = logging.getLogger("gen3_metadata") if logger is None else logger
    self.key_file_path = key_file_path
    # Request headers; empty until authentication fills them in.
    self.headers = {}
    # Raw JSON payloads and their DataFrame counterparts, both keyed alike.
    self.data_store = {}
    self.data_store_pd = {}
    self.logger.info(f"Initialized Gen3MetadataParser with key file: {key_file_path}")
31+
def _add_quotes_to_json(self, input_str):
    """
    Parse a JSON string, repairing missing quotes if the first parse fails.

    The string is first parsed unchanged. If that fails, bare keys and bare
    simple values are wrapped in double quotes and the result is parsed again.

    Args:
        input_str (str): JSON text, possibly lacking quotes around keys/values.

    Returns:
        The object produced by ``json.loads``.

    Raises:
        ValueError: If the repaired text still is not valid JSON.
    """
    try:
        self.logger.debug("Attempting to parse JSON as-is.")
        return json.loads(input_str)
    except json.JSONDecodeError:
        self.logger.warning("JSON decode failed, attempting to fix missing quotes in JSON.")
        # A bare word directly after "{" or "," and before ":" is treated as a key.
        bare_key = r'([{,]\s*)(\w+)\s*:'
        # A bare token after ":" and before "}" or "," is treated as a string value;
        # already-quoted values do not match because '"' is outside the class.
        bare_value = r':\s*([A-Za-z0-9._:@/-]+)(?=\s*[},])'
        with_keys = re.sub(bare_key, r'\1"\2":', input_str)
        repaired = re.sub(bare_value, r': "\1"', with_keys)
        try:
            self.logger.debug("Trying to parse fixed JSON string.")
            return json.loads(repaired)
        except json.JSONDecodeError as e:
            message = f"Could not fix JSON: {e}"
            self.logger.error(message)
            raise ValueError(message)
3849
def _load_api_key(self) -> dict:
    """
    Load the API key from the key file at ``self.key_file_path``.

    The file is read once as text. If the text contains no single or double
    quote characters at all, it is handed to ``_add_quotes_to_json`` for
    repair; otherwise the text is parsed directly as JSON.

    Returns:
        dict: The API key loaded from the JSON file.

    Raises:
        FileNotFoundError: If the key file does not exist.
        json.JSONDecodeError: If the quoted content is not valid JSON.
        Exception: Any other failure (including a failed repair) is logged,
            printed and re-raised unchanged.
    """
    try:
        self.logger.info(f"Loading API key from file: {self.key_file_path}")
        # Read the file as plain text -- once; the same text is parsed below
        # instead of reopening the file for a second read.
        with open(self.key_file_path, "r") as f:
            content = f.read()
        # If the content does not contain any double or single quotes, try to fix it
        if '"' not in content and "'" not in content:
            self.logger.warning("API key file appears to lack quotes, attempting to fix.")
            return self._add_quotes_to_json(content)

        self.logger.debug("Parsing API key file as JSON.")
        return json.loads(content)
    except FileNotFoundError as fnf_err:
        self.logger.error(f"File not found: {fnf_err}")
        print(f"File not found: {fnf_err}")
        raise
    except json.JSONDecodeError as json_err:
        self.logger.error(f"JSON decode error: {json_err}")
        print(f"JSON decode error: {json_err}")
        print("Please make sure the file contains valid JSON with quotes and proper formatting.")
        raise
    except Exception as err:
        self.logger.error(f"An unexpected error occurred while loading API key: {err}")
        print(f"An unexpected error occurred while loading API key: {err}")
        raise
6784
@@ -76,10 +93,11 @@ def _url_from_jwt(self, cred: dict) -> str:
7693 str: The extracted URL.
7794 """
7895 jwt_token = cred ['api_key' ]
96+ self .logger .debug ("Decoding JWT to extract API URL." )
7997 url = jwt .decode (jwt_token , options = {"verify_signature" : False }).get ('iss' , '' ).removesuffix ("/user" )
98+ self .logger .info (f"Extracted API URL from JWT: { url } " )
8099 return url
81100
82-
def authenticate(self) -> dict:
    """
    Authenticates with the Gen3 API using the loaded API key.

    Loads the key file, derives the API base URL from the key's JWT, and
    posts the key to ``<api_url>/user/credentials/cdis/access_token``. On
    success the bearer token is stored in ``self.headers``.

    Returns:
        dict: Headers containing the authorization token (the same object
        stored on ``self.headers``).

    Raises:
        requests.exceptions.HTTPError: If the response status signals an error.
        requests.exceptions.RequestException: If the request itself fails.
        KeyError: If an expected key is missing, e.g. 'access_token' in the
            response body.
        Exception: Any other failure is logged, printed and re-raised.
    """
    try:
        self.logger.info("Starting authentication process.")
        key = self._load_api_key()
        api_url = self._url_from_jwt(key)
        # Build the endpoint once so the logged URL is exactly the one requested.
        token_url = f"{api_url}/user/credentials/cdis/access_token"
        self.logger.info(f"Sending authentication request to: {token_url}")
        response = requests.post(token_url, json=key)
        self.logger.debug(f"Authentication response status code: {response.status_code}")
        response.raise_for_status()
        access_token = response.json()['access_token']
        self.headers = {'Authorization': f"bearer {access_token}"}
        self.logger.info(
            f"Authentication successful. Access token received. Status code: {response.status_code}"
        )
        print(f"Authentication successful: {response.status_code}")
        # Honour the declared ``-> dict`` contract instead of returning None.
        return self.headers
    except requests.exceptions.HTTPError as http_err:
        # Read the status from the exception's own response; it may be absent.
        message = (
            f"HTTP error occurred during authentication: {http_err} - "
            f"Status Code: {getattr(http_err.response, 'status_code', 'N/A')}"
        )
        self.logger.error(message)
        print(message)
        raise
    except requests.exceptions.RequestException as req_err:
        self.logger.error(f"Request error occurred during authentication: {req_err}")
        print(f"Request error occurred during authentication: {req_err}")
        raise
    except KeyError as key_err:
        message = f"Key error: {key_err} - The response may not contain 'access_token'"
        self.logger.error(message)
        print(message)
        raise
    except Exception as err:
        self.logger.error(f"An unexpected error occurred during authentication: {err}")
        print(f"An unexpected error occurred during authentication: {err}")
        raise
117148
@@ -125,6 +156,7 @@ def json_to_pd(self, json_data) -> pd.DataFrame:
125156 Returns:
126157 pandas.DataFrame: The converted pandas DataFrame.
127158 """
159+ self .logger .debug ("Converting JSON data to pandas DataFrame." )
128160 return pd .json_normalize (json_data )
129161
130162 def fetch_data (
@@ -146,41 +178,63 @@ def fetch_data(
146178 dict or None: The fetched data if return_data is True, otherwise None.
147179 """
148180 try :
181+ self .logger .info (
182+ f"Fetching data for program: { program_name } , project: { project_code } , "
183+ f"node: { node_label } , API version: { api_version } "
184+ )
149185 creds = self ._load_api_key ()
150186 api_url = self ._url_from_jwt (creds )
151187 url = (
152188 f"{ api_url } /api/{ api_version } /submission/{ program_name } /{ project_code } /"
153189 f"export/?node_label={ node_label } &format=json"
154190 )
191+ self .logger .info (f"GET request to URL: { url } " )
155192 response = requests .get (url , headers = self .headers )
193+ self .logger .info (f"Fetch data response status code: { response .status_code } " )
156194 print (f"status code: { response .status_code } " )
157195 response .raise_for_status ()
158196 data = response .json ()
159197
160198 key = f"{ program_name } /{ project_code } /{ node_label } "
161199 self .data_store [key ] = data
200+ self .logger .info (f"Data for { key } has been fetched and stored in data_store." )
162201
163202 if return_data :
203+ self .logger .debug (f"Returning fetched data for { key } ." )
164204 return data
165205 else :
206+ self .logger .info (f"Data for { key } has been fetched and stored." )
166207 print (f"Data for { key } has been fetched and stored." )
167208 except requests .exceptions .HTTPError as http_err :
209+ self .logger .error (
210+ f"HTTP error occurred: { http_err } - "
211+ f"Status Code: { getattr (http_err .response , 'status_code' , 'N/A' )} "
212+ )
168213 print (
169214 f"HTTP error occurred: { http_err } - "
170- f"Status Code: { response . status_code } "
215+ f"Status Code: { getattr ( http_err . response , ' status_code' , 'N/A' ) } "
171216 )
172217 raise
173218 except Exception as err :
219+ self .logger .error (f"An error occurred while fetching data: { err } " )
174220 print (f"An error occurred: { err } " )
175221 raise
176222
def data_to_pd(self) -> None:
    """
    Convert every JSON payload in ``self.data_store`` to a pandas DataFrame.

    Each entry's ``'data'`` value is passed through ``self.json_to_pd`` and
    the result is stored in ``self.data_store_pd`` under the same key. A
    failure for one key is logged and printed, and the loop moves on to the
    next key; nothing is raised.
    """
    self.logger.info("Converting all fetched JSON data in data_store to pandas DataFrames.")
    for store_key, payload in self.data_store.items():
        progress = f"Converting {store_key} to pandas dataframe..."
        self.logger.info(progress)
        print(progress)
        try:
            self.data_store_pd[store_key] = self.json_to_pd(payload['data'])
            self.logger.debug(f"Conversion successful for {store_key}.")
        except Exception as e:
            # Best-effort: report the failure and keep converting the rest.
            failure = f"Failed to convert {store_key} to pandas DataFrame: {e}"
            self.logger.error(failure)
            print(failure)
    self.logger.info("All available data converted to pandas DataFrames.")
185239
def fetch_data_pd(self, program_name, project_code, node_label, api_version="v0"):
    """
    Fetch one node's data and return it as a pandas DataFrame.

    Delegates the download to ``self.fetch_data`` (with ``return_data=True``)
    and converts the ``'data'`` entry of the result with ``self.json_to_pd``.

    Args:
        program_name (str): Program segment passed through to ``fetch_data``.
        project_code (str): Project segment passed through to ``fetch_data``.
        node_label (str): Node label passed through to ``fetch_data``.
        api_version (str, optional): The version of the API to use.
            Defaults to "v0".

    Returns:
        The value returned by ``self.json_to_pd`` for the fetched data.

    Raises:
        Exception: Conversion failures are logged, printed and re-raised;
            errors from ``fetch_data`` propagate unchanged.
    """
    node_path = f"{program_name}/{project_code}/{node_label}"
    self.logger.info(
        f"Fetching data as pandas DataFrame for {node_path} "
        f"(API version: {api_version})"
    )
    payload = self.fetch_data(
        program_name,
        project_code,
        node_label,
        api_version=api_version,
        return_data=True,
    )
    try:
        frame = self.json_to_pd(payload['data'])
        self.logger.info(
            f"Successfully converted data to pandas DataFrame for {node_path}"
        )
        return frame
    except Exception as e:
        failure = f"Failed to convert fetched data to pandas DataFrame: {e}"
        self.logger.error(failure)
        print(failure)
        raise
200268
def fetch_data_json(self, program_name, project_code, node_label, api_version="v0"):
    """
    Fetch one node's data and return it as parsed JSON.

    Logs the request, then delegates to ``self.fetch_data`` with
    ``return_data=True`` and returns whatever it returns.

    Args:
        program_name (str): Program segment passed through to ``fetch_data``.
        project_code (str): Project segment passed through to ``fetch_data``.
        node_label (str): Node label passed through to ``fetch_data``.
        api_version (str, optional): The version of the API to use.
            Defaults to "v0".
    """
    node_path = f"{program_name}/{project_code}/{node_label}"
    self.logger.info(
        f"Fetching data as JSON for {node_path} (API version: {api_version})"
    )
    fetched = self.fetch_data(
        program_name,
        project_code,
        node_label,
        api_version=api_version,
        return_data=True,
    )
    return fetched
0 commit comments