2222from collections import defaultdict
2323import json
2424import logging
25+ from operator import itemgetter
2526from pathlib import Path
2627import time
2728from zipfile import ZipFile
@@ -69,7 +70,8 @@ class PresidencyScraper():
6970 unknownID = 'unknown'
7071
7172
72- def __init__ (self , initialURL :str , timeout :float = 1.0 , logLevel :int = 20 , override :bool = True , include :dict [str , list ]= {}, exclude :dict [str , list ]= {}):
73+ def __init__ (self , initialURL :str , timeout :float = 1.0 , logLevel :int = 20 , override :bool = True , include :dict [str , list ]= {}, exclude :dict [str , list ]= {},
74+ customRootDir :str = None ):
7375 self .initialURL = initialURL
7476 self .timeout = timeout
7577 self .include = include
@@ -80,7 +82,7 @@ def __init__(self, initialURL:str, timeout:float=1.0, logLevel:int=20, override:
8082
8183 self ._checkInitialURL ()
8284 self ._checkIncludeExclude ()
83- self .directories = self ._getDirectories (override )
85+ self .directories = self ._getDirectories (override , customRootDir )
8486
8587 self .logger = self ._setLogger (logLevel )
8688 self .logger .info (f'{ self .__class__ .__qualname__ } initialized' )
@@ -138,13 +140,15 @@ def _setLogger(logLevel:int) -> logging.Logger:
138140 return logger
139141
140142 @staticmethod
141- def _getDirectories (override :bool ) -> dict [str , Path ]:
143+ def _getDirectories (override :bool , customRootDir : str = None ) -> dict [str , Path ]:
142144 """The method creates a dictionary with the paths to the directories and files that will be created during the scraping process."""
143145
144- if override :
146+ if customRootDir :
147+ root = customRootDir
148+ elif override :
145149 root = 'PresidencyScraperResult'
146150 else :
147- root = 'PresidencyScraperResult' + datetime .now ().strftime ("%Y-%m-$d_ %H-%M-%S" )
151+ root = 'PresidencyScraperResult' + datetime .now ().strftime ("%Y-%m-%D_ %H-%M-%S" )
148152
149153
150154 rootDir = Path (__file__ ).parent / root
@@ -157,7 +161,7 @@ def _getDirectories(override:bool) -> dict[str, Path]:
157161 'metadataExcel' : rootDir / 'metadata.xlsx' ,
158162 'csv' : rootDir / 'search_results.csv' ,
159163 'scrapedWebsites' : rootDir / 'scrapedWebsites.txt' ,
160- 'zip' : rootDir / 'documents .zip' }
164+ 'zip' : rootDir / 'corpora .zip' }
161165
162166 directory ['scrapedWebsites' ].touch ()
163167 directory ['content' ].touch ()
@@ -368,7 +372,7 @@ def resultToDataframe(self) -> None:
368372
369373 contentAgg = defaultdict (list )
370374
371- populationPath = 'presidencyScraper/USPopulation/SUB-EST2020_ALL .csv'
375+ populationPath = 'presidencyScraper/USPopulation/SUB-EST2020_ALL_adj .csv'
372376 self .populationDF = pd .read_csv (populationPath , sep = ',' )
373377
374378
@@ -387,10 +391,14 @@ def resultToDataframe(self) -> None:
387391
388392 df = pd .DataFrame .from_dict (contentAgg )
389393
394+
390395 df ['population' ] = df .apply (lambda row : self ._addPopulationCount (self .populationDF , row ['state' ], row ['city' ]), axis = 1 )
391396
392397 df = df [['speaker' , 'date' , 'state' , 'city' , 'population' , 'title' , 'citation' , 'categories' , 'link' ]]
393398
399+
400+ df ['date' ] = pd .to_datetime (df ['date' ], format = '%B %d, %Y' ).dt .date
401+
394402 df .to_csv (self .directories ['metadataCSV' ])
395403 df .to_excel (self .directories ['metadataExcel' ])
396404
@@ -431,6 +439,7 @@ def _addPopulationCount(populationDF:pd.DataFrame, state:str, city:str) -> int:
431439 # get a list of all elements of the population column where the state matches
432440 result = cityDF .loc [populationDF ['STNAME' ] == state ].get (populationCol ).to_list ()
433441
442+
434443 if result :
435444 population = int (result [- 1 ])
436445 else :
@@ -439,17 +448,79 @@ def _addPopulationCount(populationDF:pd.DataFrame, state:str, city:str) -> int:
439448 return population
440449
441450
def analyzeMetadata(self, path: str = None) -> None:
    """Plot summary charts for a metadata CSV and save them as a PNG.

    Three stacked charts are drawn: number of speeches per speaker, number
    of speeches per calendar month, and number of speeches per state. The
    figure is written to ``visualization.png`` next to the CSV file.

    Args:
        path: Location of the metadata CSV. When ``None``, the path stored
            under ``self.directories['metadataCSV']`` is used. The CSV must
            provide the columns ``speaker``, ``date`` (``YYYY-MM-DD``) and
            ``state``.

    Returns:
        None. The only result is the image file written to disk.
    """
    # Imported lazily so the plotting libraries are only required by callers
    # that actually use this method.
    import matplotlib.pyplot as plt
    import seaborn as sns

    path = Path(self.directories['metadataCSV'] if path is None else path)

    df = pd.read_csv(path)

    output = path.parent / 'visualization.png'

    partyColor = {'John McCain': 'red', 'Barack Obama': 'blue',
                  'Mitt Romney': 'red', 'Hillary Clinton': 'blue',
                  'Donald J. Trump': 'red', 'Joseph R. Biden, Jr.': 'blue',
                  'Kamala Harris': 'blue'}
    # Speakers missing from the table above get a neutral colour instead of
    # aborting the whole plot with a KeyError.
    fallbackColor = 'gray'

    sns.set_style('darkgrid')
    fig, axes = plt.subplots(3, 1, figsize=(14, 12))

    try:
        # speeches per candidate
        speakers = df['speaker'].value_counts()
        barColors = [partyColor.get(name, fallbackColor) for name in speakers.index]

        axes[0].bar(speakers.index, speakers.values, color=barColors or fallbackColor)
        axes[0].set_title('Number of Speeches per Candidate')

        # speeches per month
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
        df['month'] = df['date'].dt.to_period('M')

        # Sort chronologically so the first/last entries really are the
        # earliest/latest month; value_counts alone gives no such guarantee.
        speechesDate = df['month'].value_counts().sort_index()
        months = speechesDate.index.to_timestamp()

        axes[1].scatter(months, speechesDate.values, color='orange')
        axes[1].set_title('Speeches in the Dataset')
        axes[1].set_ylim(bottom=0)
        # Limits use the same timestamp units as the plotted points. With
        # fewer than two months the automatic limits are kept, because
        # identical lower/upper limits are not a usable range.
        if len(months) > 1:
            axes[1].set_xlim(months[0], months[-1])

        # speeches per state
        states = df['state'].value_counts()

        axes[2].bar(states.index, states.values, color='green')
        axes[2].set_title('State of the Speech')
        # Categorical bars are centred on 0..n-1, so pad by half a slot on
        # both sides to keep the first and last bar fully visible.
        if len(states) > 0:
            axes[2].set_xlim(-0.5, len(states) - 0.5)
        axes[2].tick_params(axis='x', rotation=45, labelsize=8)

        fig.tight_layout()
        fig.savefig(output)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

    return None
513+
442514
443515
if __name__ == '__main__':

    # Advanced-search query on presidency.ucsb.edu: documents dated from
    # 01-01-2008 to 11-08-2024, 100 items per result page. The date values
    # must not contain whitespace, otherwise the query string is malformed.
    url = "https://www.presidency.ucsb.edu/advanced-search?field-keywords=&field-keywords2=&field-keywords3=&from%5Bdate%5D=01-01-2008&to%5Bdate%5D=11-08-2024&person2=&category2%5B0%5D=63&items_per_page=100&f%5B0%5D=field_docs_attributes%3A205"

    # Filter settings handed to the scraper, keyed by metadata field.
    # NOTE(review): presumably only the listed speakers are kept and titles
    # containing 'Press Release' are dropped; confirm against the filter logic.
    include = {'speaker': ['John McCain', 'Barack Obama', 'Mitt Romney', 'Hillary Clinton', 'Donald J. Trump', 'Joseph R. Biden, Jr.', 'Kamala Harris']}
    exclude = {'title_substring': ['Press Release']}

    # customRootDir takes precedence over `override` when choosing the output
    # directory, so results go into the named directory.
    scraper = PresidencyScraper(url, timeout=1.5, include=include, exclude=exclude, override=False, customRootDir='PresidencyScraperResult2024-11-09_22-01-49')

    # NOTE(review): the meaning of `limit` is defined by scrape(), which is
    # not visible here.
    scraper.scrape(limit=20)
0 commit comments