
Commit 6f13bc6

added plotting functionality
1 parent c5b6199 · commit 6f13bc6

2 files changed: +85 -12 lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
@@ -8,8 +8,10 @@ __pycache__/
 # testing
 test/*
 test.py
-
+data.xml
 
 # ignore results
+presidencyScraper/PresidencyScraper*
 presidencyScraper/PresidencyScraperResult/*
 presidencyScraper/PresidencyScraperResult_600Biden/*
+
presidencyScraper/presidencyScraper.py

Lines changed: 82 additions & 11 deletions
@@ -22,6 +22,7 @@
 from collections import defaultdict
 import json
 import logging
+from operator import itemgetter
 from pathlib import Path
 import time
 from zipfile import ZipFile
@@ -69,7 +70,8 @@ class PresidencyScraper():
     unknownID = 'unknown'
 
 
-    def __init__(self, initialURL:str, timeout:float=1.0, logLevel:int=20, override:bool=True, include:dict[str, list]={}, exclude:dict[str, list]={}):
+    def __init__(self, initialURL:str, timeout:float=1.0, logLevel:int=20, override:bool=True, include:dict[str, list]={}, exclude:dict[str, list]={},
+                 customRootDir:str=None):
         self.initialURL = initialURL
         self.timeout = timeout
         self.include = include
@@ -80,7 +82,7 @@ def __init__(self, initialURL:str, timeout:float=1.0, logLevel:int=20, override:
 
         self._checkInitialURL()
         self._checkIncludeExclude()
-        self.directories = self._getDirectories(override)
+        self.directories = self._getDirectories(override, customRootDir)
 
         self.logger = self._setLogger(logLevel)
         self.logger.info(f'{self.__class__.__qualname__} initialized')
@@ -138,13 +140,15 @@ def _setLogger(logLevel:int) -> logging.Logger:
         return logger
 
     @staticmethod
-    def _getDirectories(override:bool) -> dict[str, Path]:
+    def _getDirectories(override:bool, customRootDir:str=None) -> dict[str, Path]:
         """The method creates a dictionary with the paths to the directories and files that will be created during the scraping process."""
 
-        if override:
+        if customRootDir:
+            root = customRootDir
+        elif override:
             root = 'PresidencyScraperResult'
         else:
-            root = 'PresidencyScraperResult' + datetime.now().strftime("%Y-%m-$d_%H-%M-%S")
+            root = 'PresidencyScraperResult' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
 
 
         rootDir = Path(__file__).parent / root
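
Note on the hunk above: _getDirectories now prefers an explicit customRootDir, falls back to the fixed PresidencyScraperResult folder when override is set, and otherwise builds a timestamped folder name. A minimal standalone sketch of that branching (the function name resolveRoot is made up for illustration; the real logic lives inside _getDirectories):

    from datetime import datetime

    def resolveRoot(override: bool, customRootDir: str = None) -> str:
        # an explicit custom root (e.g. to resume into an existing result folder) wins
        if customRootDir:
            return customRootDir
        # otherwise 'override' reuses the fixed folder name
        elif override:
            return 'PresidencyScraperResult'
        # otherwise a fresh timestamped folder name is generated
        return 'PresidencyScraperResult' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    print(resolveRoot(False, 'PresidencyScraperResult2024-11-09_22-01-49'))
    # PresidencyScraperResult2024-11-09_22-01-49
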
@@ -157,7 +161,7 @@ def _getDirectories(override:bool) -> dict[str, Path]:
                      'metadataExcel': rootDir / 'metadata.xlsx',
                      'csv': rootDir / 'search_results.csv',
                      'scrapedWebsites': rootDir / 'scrapedWebsites.txt',
-                     'zip': rootDir / 'documents.zip'}
+                     'zip': rootDir / 'corpora.zip'}
 
         directory['scrapedWebsites'].touch()
         directory['content'].touch()
@@ -368,7 +372,7 @@ def resultToDataframe(self) -> None:
 
         contentAgg = defaultdict(list)
 
-        populationPath = 'presidencyScraper/USPopulation/SUB-EST2020_ALL.csv'
+        populationPath = 'presidencyScraper/USPopulation/SUB-EST2020_ALL_adj.csv'
         self.populationDF = pd.read_csv(populationPath, sep=',')
 
 
@@ -387,10 +391,14 @@ def resultToDataframe(self) -> None:
 
         df = pd.DataFrame.from_dict(contentAgg)
 
+
         df['population'] = df.apply(lambda row: self._addPopulationCount(self.populationDF, row['state'], row['city']), axis=1)
 
         df = df[['speaker', 'date', 'state', 'city', 'population', 'title', 'citation', 'categories', 'link']]
 
+
+        df['date'] = pd.to_datetime(df['date'], format='%B %d, %Y').dt.date
+
         df.to_csv(self.directories['metadataCSV'])
         df.to_excel(self.directories['metadataExcel'])
 
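Note on the added df['date'] line above: it assumes the scraped dates are spelled out like 'November 5, 2024', which is what the %B %d, %Y format string matches. A small self-contained example of the conversion (sample values are illustrative only):

    import pandas as pd

    dates = pd.Series(['November 5, 2024', 'January 20, 2021'])
    parsed = pd.to_datetime(dates, format='%B %d, %Y').dt.date
    print(parsed.tolist())  # [datetime.date(2024, 11, 5), datetime.date(2021, 1, 20)]
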
@@ -431,6 +439,7 @@ def _addPopulationCount(populationDF:pd.DataFrame, state:str, city:str) -> int:
         # get a list of all elements of the population column where the state matches
         result = cityDF.loc[populationDF['STNAME'] == state].get(populationCol).to_list()
 
+
         if result:
             population = int(result[-1])
         else:
@@ -439,17 +448,79 @@ def _addPopulationCount(populationDF:pd.DataFrame, state:str, city:str) -> int:
         return population
 
 
+    def analyzeMetadata(self, path:str = None) -> None:
+        import matplotlib.pyplot as plt
+        import seaborn as sns
+
+
+        if path is None:
+            path = self.directories['metadataCSV']
+        else:
+            path = Path(path)
+
+        df = pd.read_csv(path)
+
+        output = path.parent / 'visualization.png'
+
+        partyColor = {'John McCain': 'red', 'Barack Obama': 'blue',
+                      'Mitt Romney': 'red', 'Hillary Clinton': 'blue',
+                      'Donald J. Trump': 'red', 'Joseph R. Biden, Jr.': 'blue',
+                      'Kamala Harris': 'blue'}
+
+
+        sns.set_style('darkgrid')
+        fig, axes = plt.subplots(3, 1, figsize=(14, 12))
+
+        # speeches per candidate
+        speakers = df['speaker'].value_counts()
+
+        ig = itemgetter(*speakers.index)
+
+        axes[0].bar(speakers.index, speakers.values, color=ig(partyColor))
+        axes[0].set_title('Number of Speeches per Candidate')
+
+
+        # speeches per date
+        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
+        df['month'] = df['date'].dt.to_period('M')  # Convert to monthly period
+
+        speechesDate = df['month'].value_counts(sort=False)
+
+        axes[1].scatter(speechesDate.index.to_timestamp(), speechesDate.values, color='orange')
+
+        axes[1].set_title('Speeches in the Dataset')
+        axes[1].set_ylim(bottom=0)
+        axes[1].set_xlim([speechesDate.index[0], speechesDate.index[-1]])
+
+
+        # speeches per state
+        states = df['state'].value_counts()
+
+        axes[2].bar(states.index, states.values, color='green')
+        axes[2].set_title('State of the Speech')
+        axes[2].set_xlim([0, len(states)])
+        axes[2].tick_params(axis='x', rotation=45, labelsize=8)
+
+        # axes[2].pie(states.values, labels=states.index)
+        # axes[2].set_title('State of the Speech')
+
+
+        plt.tight_layout()
+        # plt.show()
+        plt.savefig(output)
+        plt.close()
+        return None
+
 
 
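Note on the bar colors in the new analyzeMetadata method: itemgetter(*speakers.index) builds a getter from the speakers in plotting order, so calling it on partyColor returns the matching colors as a single tuple. A tiny illustration with made-up sample data:

    from operator import itemgetter

    partyColor = {'Barack Obama': 'blue', 'Mitt Romney': 'red'}
    order = ['Mitt Romney', 'Barack Obama']      # order of the bars
    colors = itemgetter(*order)(partyColor)      # look everything up in one call
    print(colors)  # ('red', 'blue')

This relies on every speaker in the metadata CSV having an entry in partyColor; a missing name would raise a KeyError.
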
 if __name__ == '__main__':
 
-    url = "https://www.presidency.ucsb.edu/advanced-search?field-keywords=&field-keywords2=&field-keywords3=&from%5Bdate%5D=01-01-2008&to%5Bdate%5D=10-28-2024&person2=&category2%5B0%5D=63&items_per_page=100&f%5B0%5D=field_docs_attributes%3A205"
+    url = "https://www.presidency.ucsb.edu/advanced-search?field-keywords=&field-keywords2=&field-keywords3=&from%5Bdate%5D=01-01-2008&to%5Bdate%5D=11-08-2024&person2=&category2%5B0%5D=63&items_per_page=100&f%5B0%5D=field_docs_attributes%3A205"
 
     include = {'speaker': ['John McCain', 'Barack Obama', 'Mitt Romney', 'Hillary Clinton', 'Donald J. Trump', 'Joseph R. Biden, Jr.', 'Kamala Harris']}
     exclude = {'title_substring': ['Press Release']}
 
-    scraper = PresidencyScraper(url, timeout=2.1, include=include, exclude=exclude)
-
-    scraper.scrape(limit=8)
+    scraper = PresidencyScraper(url, timeout=1.5, include=include, exclude=exclude, override=False, customRootDir='PresidencyScraperResult2024-11-09_22-01-49')
 
+    scraper.scrape(limit=20)
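A hedged usage sketch of the new plotting entry point, assuming the result folder named in the __main__ block already exists from an earlier run and that url, include, and exclude are defined as above (whether the metadata CSV is already present there depends on that earlier run):

    # reuse the existing result folder instead of creating a new one
    scraper = PresidencyScraper(url, include=include, exclude=exclude,
                                override=False,
                                customRootDir='PresidencyScraperResult2024-11-09_22-01-49')
    # reads the metadata CSV from the result folder and saves visualization.png next to it
    scraper.analyzeMetadata()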