4,466 changes: 4,466 additions & 0 deletions faculty_data.json

Large diffs are not rendered by default.

147 changes: 147 additions & 0 deletions web-scraper/History_WebScraper.py
@@ -0,0 +1,147 @@
import requests
from bs4 import BeautifulSoup
import time
import json
from urllib.parse import urljoin

BASE_URL = "https://history.yale.edu"
FACULTY_URL = "https://history.yale.edu/people/faculty"

HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/118.0.0.0 Safari/537.36"
)
}

# Fetch HTML & parse
def get_soup(url):
try:
r = requests.get(url, headers=HEADERS, timeout=15)
r.raise_for_status()
return BeautifulSoup(r.text, "html.parser")
except Exception as e:
print(f"Error fetching {url}: {e}")
return None
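
# Optional sketch (not part of the original code): if transient 429/5xx errors appear,
# get_soup could use a shared requests.Session configured with automatic retries.
# The helper name `make_retry_session` is illustrative and is not used elsewhere.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retry_session():
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.headers.update(HEADERS)
    return session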

def get_faculty_list():
faculty = []
page = 0

while True:
url = f"{FACULTY_URL}?page={page}"
print("Loading list page:", url)

soup = get_soup(url)
if not soup:
break

rows = soup.select("table.views-table tbody tr")
if not rows:
break # No more pages

for row in rows:
# Name + profile URL
name_link = row.select_one("td.views-field-name a")
if not name_link:
continue

name = name_link.get_text(strip=True)
profile_url = urljoin(BASE_URL, name_link["href"])

# Fields of interest
interest_cell = row.select_one(
"td.views-field-field-field-s-of-interest"
)
fields_of_interest = (
interest_cell.get_text(" ", strip=True)
if interest_cell
else None
)

faculty.append({
"name": name,
"profile_url": profile_url,
"fields_of_interest": fields_of_interest
})

page += 1
time.sleep(0.5)

return faculty

# Extract full bio
def extract_full_bio(profile_url):
soup = get_soup(profile_url)
if not soup:
return None

# Find the bio section - it's in a div with a label "Bio:"
bio_parts = []

# Look for all field items in the main content area
# The bio appears after the "Bio:" label in the field content
labels = soup.find_all("div", class_="field-label")

for label in labels:
if label.get_text(strip=True) == "Bio:":
# Get the next sibling which should be the field-items div
field_items = label.find_next_sibling("div", class_="field-items")
if field_items:
# Extract all text, preserving paragraph breaks
paragraphs = field_items.find_all("p")
if paragraphs:
for p in paragraphs:
text = p.get_text(strip=True)
if text:
bio_parts.append(text)
else:
# If no paragraphs, just get all text
text = field_items.get_text(strip=True)
if text:
bio_parts.append(text)
break

    # Also collect publications if a "Publications" heading is present
    # (this list is built but not currently returned; see the sketch after this function)
publications = []
for label in soup.find_all("strong"):
if "Publications" in label.get_text():
parent = label.find_parent()
if parent:
items = parent.find_all("li")
for item in items:
publications.append(item.get_text(strip=True))

# Join bio parts
bio_text = "\n\n".join(bio_parts) if bio_parts else None

return bio_text
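
# Optional sketch (not part of the original code): the `publications` list built inside
# extract_full_bio above is collected but never returned. If it should be kept, one
# option is a standalone helper like this, with extract_full_bio changed to return
# something like {"bio": bio_text, "publications": publications}. The helper name
# `extract_publications` is illustrative only.
def extract_publications(soup):
    publications = []
    for label in soup.find_all("strong"):
        if "Publications" in label.get_text():
            parent = label.find_parent()
            if parent:
                publications.extend(
                    li.get_text(strip=True) for li in parent.find_all("li")
                )
    return publications
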
# Scrape all faculty
def scrape_all():
faculty = get_faculty_list()
print(f"Found {len(faculty)} faculty")

results = []

for i, person in enumerate(faculty, 1):
print(f"[{i}/{len(faculty)}] Scraping {person['name']}")
bio = extract_full_bio(person["profile_url"])

results.append({
"name": person["name"],
"department": "History",
"profile_url": person["profile_url"],
"fields_of_interest": person["fields_of_interest"],
"bio": bio
})

time.sleep(0.5) # polite scraping

with open("yale_history_faculty.json", "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Saved yale_history_faculty.json with {len(results)} faculty members")

if __name__ == "__main__":
scrape_all()
78 changes: 78 additions & 0 deletions web-scraper/Web_Scraper.py
@@ -0,0 +1,78 @@
"""
Scrapes the Yale directory website to gather information about all listed professors.
Eventually this will create default RDB listings for every professor in the directory.
"""

import csv
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from string import ascii_lowercase

BASE_URL = "https://physics.yale.edu/people"

driver = webdriver.Chrome()

def getURL(lastName):
return BASE_URL + lastName

def getSite(lastName, maxSearchDuration = 3):
    driver.get(getURL(lastName))
    searchDuration = 0
    # Poll until the loading indicator is hidden (or the time limit is reached).
    # Short-circuiting `or` avoids indexing into a missing indicator element.
    while searchDuration < maxSearchDuration:
        indicator = getSoup(driver.page_source).find(id = 'loading-indicator')
        if indicator is None or 'inline' in indicator.get('style', ''):
            time.sleep(0.1)
            searchDuration += 0.1
        else:
            break
    return driver.page_source
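
# Optional sketch (not part of the original code): getSite's polling loop could be
# replaced with Selenium's explicit wait, which blocks until the loading indicator is
# invisible or the timeout expires. The name `getSiteWait` is illustrative only.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def getSiteWait(lastName, maxSearchDuration = 3):
    driver.get(getURL(lastName))
    try:
        WebDriverWait(driver, maxSearchDuration).until(
            EC.invisibility_of_element_located((By.ID, 'loading-indicator'))
        )
    except Exception:
        pass  # time out quietly and return whatever has rendered, like getSite does
    return driver.page_source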

def getSoup(site):
return BeautifulSoup(site, 'html.parser')

def addListings(listings, nameStr = '', startChar = 'a', endChar = 'c', display = False):
for c in ascii_lowercase[ascii_lowercase.index(startChar):(ascii_lowercase.index(endChar) + 1)]:
soup = getSoup(getSite(nameStr + c))

        resultsText = soup.find(id = 'results-people-header').text

        # Parse the result count: the header normally starts with a number; if it does
        # not, treat a hidden result region as a single result and anything else as zero.
        firstWord = resultsText.split(' ')[0]
        if firstWord.isdigit():
            numResults = int(firstWord)
        elif 'display: none' in soup.find(id = 'bps-result-region')['style'].split(';'):
            numResults = 1
        else:
            numResults = 0

        if display and numResults == 25:
            print(f'Searching "{nameStr + c}"... Found {numResults} results')

        # A visible directory_results_warning means the search returned more matches
        # than are shown, so we need to recurse with a longer name prefix.
        surplusResults = numResults != 1 and 'display: block' in soup.find(id = 'bps-result-region').find('div', class_ = 'directory_results_warning')['style'].split(';')

        # Handle surplus results by searching one character deeper
        if surplusResults:
            addListings(listings = listings, nameStr = nameStr + c)
        else:
            listings.extend(soup.find_all("article", class_ = "directory_item")[0:numResults])

def getListings(startChar = 'a', endChar = 'z', display = False):
listings = []
addListings(listings = listings, startChar = startChar, endChar = endChar, display = display)
return listings
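
# Optional sketch (not part of the original script): saveListingsToCSV below writes each
# raw BeautifulSoup Tag to the CSV. If plain text names are wanted instead, a helper like
# this could be applied first, e.g. saveListingsToCSV([listingToText(l) for l in myListings]).
# The helper name `listingToText` is illustrative only.
def listingToText(listing):
    # Collapse the article's text into a single whitespace-normalized string
    return listing.get_text(' ', strip=True)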


def saveListingsToCSV(listings, filename='listings.csv'):
with open(filename, mode='w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
writer.writerow(['Listing Name']) # Header row
for listing in listings:
writer.writerow([listing])


# Collect listings
myListings = []
addListings(myListings)

# Save to CSV
saveListingsToCSV(myListings)

# Output the total number of listings found
print(f'Total listings found: {len(myListings)}')

# Close the Selenium driver
driver.quit()

#5620
121 changes: 121 additions & 0 deletions web-scraper/testscrape.py
@@ -0,0 +1,121 @@
import requests
from bs4 import BeautifulSoup
import time
import json

url = "https://physics.yale.edu/people"
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, "html.parser")

faculty_data = []
count = 0

# Loop through each table row containing a listing
for row in soup.find_all("tr"):
name_cell = row.find("td", class_="views-field-name")
if not name_cell:
continue # Skip rows without a listing

# --- Name & profile link ---
name_tag = name_cell.find("a", class_="username")
name = name_tag.get_text(strip=True) if name_tag else None
profile_link = f"https://physics.yale.edu{name_tag['href']}" if name_tag else None

# --- Text content ---
text_parts = list(name_cell.stripped_strings)
# First entry is name
title = text_parts[1] if len(text_parts) > 1 else None
office = text_parts[2] if len(text_parts) > 2 else None

# --- Email ---
email_tag = name_cell.find("a", href=lambda x: x and x.startswith("mailto:"))
email = email_tag.get_text(strip=True) if email_tag else None

# --- Website ---
website_tag = name_cell.find("a", href=lambda x: x and x.startswith("http"))
website = website_tag['href'] if website_tag else None

# --- Phone numbers ---
phones = []
for t in text_parts:
# Match typical phone patterns
if "Phone:" in t or t.replace("-", "").strip().isdigit():
phones.append(t.replace("Phone:", "").strip())

# --- Picture ---
pic_cell = row.find("td", class_="views-field-picture")
img_tag = pic_cell.find("img") if pic_cell else None
image_url = img_tag['src'] if img_tag else None

# --- Field of study ---
study_cell = row.find("td", class_="views-field-field-field-of-study")
field_of_study = study_cell.get_text(strip=True) if study_cell else None

count += 1

# --- Scrape profile page for bio text ---
profile_bio = None
if profile_link:
try:
print(f"Scraping profile page {count}: {name} - {profile_link}...")
profile_response = requests.get(profile_link, timeout=10)
profile_soup = BeautifulSoup(profile_response.text, "html.parser")

# Look for the research narrative field
research_field = profile_soup.find("div", class_="field-name-field-research-narrative")
if research_field:
field_item = research_field.find("div", class_="field-item even")
if field_item:
profile_bio = field_item.get_text(separator=" ", strip=True)

time.sleep(0.5)
except Exception as e:
print(f"Error scraping profile page {profile_link}: {e}")
profile_bio = None

# --- Scrape external website for research/bio text ---
website_text = None
if website:
try:
print(f"Scraping website {count}: {name} - {website}...")
website_response = requests.get(website, timeout=10)
website_soup = BeautifulSoup(website_response.text, "html.parser")

# Extract text from the specific div with class "field-item even"
field_item = website_soup.find("div", class_="field-item even")
if field_item:
website_text = field_item.get_text(separator=" ", strip=True)
else:
# Fallback: try just "field-item" or "field-items"
field_item = website_soup.find("div", class_="field-item")
if field_item:
website_text = field_item.get_text(separator=" ", strip=True)

# Optional: be respectful with rate limiting
time.sleep(0.5)
except Exception as e:
print(f"Error scraping {website}: {e}")
website_text = None
else:
print(f"No website found for {name}")

faculty_data.append({
"name": name,
"profile_link": profile_link,
"title": title,
"office": office,
"email": email,
"phones": phones,
"website": website,
"image_url": image_url,
"field_of_study": field_of_study,
"profile_bio": profile_bio,
"website_text": website_text
})

# Save to JSON file
with open("faculty_data.json", "w", encoding="utf-8") as json_file:
json.dump(faculty_data, json_file, indent=2, ensure_ascii=False)

print(f"\nData saved to faculty_data.json")
print(f"Total faculty members scraped: {len(faculty_data)}")