4,466 changes: 4,466 additions & 0 deletions faculty_data.json

Large diffs are not rendered by default.

147 changes: 147 additions & 0 deletions web-scraper/History_WebScraper.py
@@ -0,0 +1,147 @@
import requests
from bs4 import BeautifulSoup
import time
import json
from urllib.parse import urljoin

BASE_URL = "https://history.yale.edu"
FACULTY_URL = "https://history.yale.edu/people/faculty"

HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/118.0.0.0 Safari/537.36"
)
}

# Fetch HTML & parse
def get_soup(url):
try:
r = requests.get(url, headers=HEADERS, timeout=15)
r.raise_for_status()
return BeautifulSoup(r.text, "html.parser")
except Exception as e:
print(f"Error fetching {url}: {e}")
return None
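
# Optional sketch (not part of the original code): if transient 429/5xx errors appear,
# get_soup could use a shared requests.Session configured with automatic retries.
# The helper name `make_retry_session` is illustrative and is not used elsewhere.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retry_session():
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.headers.update(HEADERS)
    return session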

def get_faculty_list():
faculty = []
page = 0

while True:
url = f"{FACULTY_URL}?page={page}"
print("Loading list page:", url)

soup = get_soup(url)
if not soup:
break

rows = soup.select("table.views-table tbody tr")
if not rows:
break # No more pages

for row in rows:
# Name + profile URL
name_link = row.select_one("td.views-field-name a")
if not name_link:
continue

name = name_link.get_text(strip=True)
profile_url = urljoin(BASE_URL, name_link["href"])

# Fields of interest
interest_cell = row.select_one(
"td.views-field-field-field-s-of-interest"
)
fields_of_interest = (
interest_cell.get_text(" ", strip=True)
if interest_cell
else None
)

faculty.append({
"name": name,
"profile_url": profile_url,
"fields_of_interest": fields_of_interest
})

page += 1
time.sleep(0.5)

return faculty

# Extract full bio
def extract_full_bio(profile_url):
soup = get_soup(profile_url)
if not soup:
return None

# Find the bio section - it's in a div with a label "Bio:"
bio_parts = []

# Look for all field items in the main content area
# The bio appears after the "Bio:" label in the field content
labels = soup.find_all("div", class_="field-label")

for label in labels:
if label.get_text(strip=True) == "Bio:":
# Get the next sibling which should be the field-items div
field_items = label.find_next_sibling("div", class_="field-items")
if field_items:
# Extract all text, preserving paragraph breaks
paragraphs = field_items.find_all("p")
if paragraphs:
for p in paragraphs:
text = p.get_text(strip=True)
if text:
bio_parts.append(text)
else:
# If no paragraphs, just get all text
text = field_items.get_text(strip=True)
if text:
bio_parts.append(text)
break

    # Also collect publications if a "Publications" heading is present
    # (this list is built but not currently returned; see the sketch after this function)
publications = []
for label in soup.find_all("strong"):
if "Publications" in label.get_text():
parent = label.find_parent()
if parent:
items = parent.find_all("li")
for item in items:
publications.append(item.get_text(strip=True))

# Join bio parts
bio_text = "\n\n".join(bio_parts) if bio_parts else None

return bio_text
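
# Optional sketch (not part of the original code): the `publications` list built inside
# extract_full_bio above is collected but never returned. If it should be kept, one
# option is a standalone helper like this, with extract_full_bio changed to return
# something like {"bio": bio_text, "publications": publications}. The helper name
# `extract_publications` is illustrative only.
def extract_publications(soup):
    publications = []
    for label in soup.find_all("strong"):
        if "Publications" in label.get_text():
            parent = label.find_parent()
            if parent:
                publications.extend(
                    li.get_text(strip=True) for li in parent.find_all("li")
                )
    return publications
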
# Scrape all faculty
def scrape_all():
faculty = get_faculty_list()
print(f"Found {len(faculty)} faculty")

results = []

for i, person in enumerate(faculty, 1):
print(f"[{i}/{len(faculty)}] Scraping {person['name']}")
bio = extract_full_bio(person["profile_url"])

results.append({
"name": person["name"],
"department": "History",
"profile_url": person["profile_url"],
"fields_of_interest": person["fields_of_interest"],
"bio": bio
})

time.sleep(0.5) # polite scraping

with open("yale_history_faculty.json", "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Saved yale_history_faculty.json with {len(results)} faculty members")

if __name__ == "__main__":
scrape_all()
78 changes: 78 additions & 0 deletions web-scraper/Web_Scraper.py
@@ -0,0 +1,78 @@
"""
Scrapes the Yale directory website to gather information about all listed professors.
Eventually this will create default RDB listings for every professor in the directory.
"""

import csv
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from string import ascii_lowercase

BASE_URL = "https://physics.yale.edu/people"

driver = webdriver.Chrome()

def getURL(lastName):
return BASE_URL + lastName

def getSite(lastName, maxSearchDuration = 3):
    driver.get(getURL(lastName))
    searchDuration = 0
    # Poll until the loading indicator is hidden (or the time limit is reached).
    # Short-circuiting `or` avoids indexing into a missing indicator element.
    while searchDuration < maxSearchDuration:
        indicator = getSoup(driver.page_source).find(id = 'loading-indicator')
        if indicator is None or 'inline' in indicator.get('style', ''):
            time.sleep(0.1)
            searchDuration += 0.1
        else:
            break
    return driver.page_source
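
# Optional sketch (not part of the original code): getSite's polling loop could be
# replaced with Selenium's explicit wait, which blocks until the loading indicator is
# invisible or the timeout expires. The name `getSiteWait` is illustrative only.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def getSiteWait(lastName, maxSearchDuration = 3):
    driver.get(getURL(lastName))
    try:
        WebDriverWait(driver, maxSearchDuration).until(
            EC.invisibility_of_element_located((By.ID, 'loading-indicator'))
        )
    except Exception:
        pass  # time out quietly and return whatever has rendered, like getSite does
    return driver.page_source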

def getSoup(site):
return BeautifulSoup(site, 'html.parser')

def addListings(listings, nameStr = '', startChar = 'a', endChar = 'c', display = False):
for c in ascii_lowercase[ascii_lowercase.index(startChar):(ascii_lowercase.index(endChar) + 1)]:
soup = getSoup(getSite(nameStr + c))

        resultsText = soup.find(id = 'results-people-header').text

        # Parse the result count: the header normally starts with a number; if it does
        # not, treat a hidden result region as a single result and anything else as zero.
        firstWord = resultsText.split(' ')[0]
        if firstWord.isdigit():
            numResults = int(firstWord)
        elif 'display: none' in soup.find(id = 'bps-result-region')['style'].split(';'):
            numResults = 1
        else:
            numResults = 0

        if display and numResults == 25:
            print(f'Searching "{nameStr + c}"... Found {numResults} results')

        # A visible directory_results_warning means the search returned more matches
        # than are shown, so we need to recurse with a longer name prefix.
        surplusResults = numResults != 1 and 'display: block' in soup.find(id = 'bps-result-region').find('div', class_ = 'directory_results_warning')['style'].split(';')

        # Handle surplus results by searching one character deeper
        if surplusResults:
            addListings(listings = listings, nameStr = nameStr + c)
        else:
            listings.extend(soup.find_all("article", class_ = "directory_item")[0:numResults])

def getListings(startChar = 'a', endChar = 'z', display = False):
listings = []
addListings(listings = listings, startChar = startChar, endChar = endChar, display = display)
return listings
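
# Optional sketch (not part of the original script): saveListingsToCSV below writes each
# raw BeautifulSoup Tag to the CSV. If plain text names are wanted instead, a helper like
# this could be applied first, e.g. saveListingsToCSV([listingToText(l) for l in myListings]).
# The helper name `listingToText` is illustrative only.
def listingToText(listing):
    # Collapse the article's text into a single whitespace-normalized string
    return listing.get_text(' ', strip=True)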


def saveListingsToCSV(listings, filename='listings.csv'):
with open(filename, mode='w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
writer.writerow(['Listing Name']) # Header row
for listing in listings:
writer.writerow([listing])


# Collect listings
myListings = []
addListings(myListings)

# Save to CSV
saveListingsToCSV(myListings)

# Output the total number of listings found
print(f'Total listings found: {len(myListings)}')

# Close the Selenium driver
driver.quit()

#5620
121 changes: 121 additions & 0 deletions web-scraper/testscrape.py
@@ -0,0 +1,121 @@
import requests
from bs4 import BeautifulSoup
import time
import json

url = "https://physics.yale.edu/people"
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, "html.parser")

faculty_data = []
count = 0

# Loop through each table row containing a listing
for row in soup.find_all("tr"):
name_cell = row.find("td", class_="views-field-name")
if not name_cell:
continue # Skip rows without a listing

# --- Name & profile link ---
name_tag = name_cell.find("a", class_="username")
name = name_tag.get_text(strip=True) if name_tag else None
profile_link = f"https://physics.yale.edu{name_tag['href']}" if name_tag else None

# --- Text content ---
text_parts = list(name_cell.stripped_strings)
# First entry is name
title = text_parts[1] if len(text_parts) > 1 else None
office = text_parts[2] if len(text_parts) > 2 else None

# --- Email ---
email_tag = name_cell.find("a", href=lambda x: x and x.startswith("mailto:"))
email = email_tag.get_text(strip=True) if email_tag else None

# --- Website ---
website_tag = name_cell.find("a", href=lambda x: x and x.startswith("http"))
website = website_tag['href'] if website_tag else None

# --- Phone numbers ---
phones = []
for t in text_parts:
# Match typical phone patterns
if "Phone:" in t or t.replace("-", "").strip().isdigit():
phones.append(t.replace("Phone:", "").strip())

# --- Picture ---
pic_cell = row.find("td", class_="views-field-picture")
img_tag = pic_cell.find("img") if pic_cell else None
image_url = img_tag['src'] if img_tag else None

# --- Field of study ---
study_cell = row.find("td", class_="views-field-field-field-of-study")
field_of_study = study_cell.get_text(strip=True) if study_cell else None

count += 1

# --- Scrape profile page for bio text ---
profile_bio = None
if profile_link:
try:
print(f"Scraping profile page {count}: {name} - {profile_link}...")
profile_response = requests.get(profile_link, timeout=10)
profile_soup = BeautifulSoup(profile_response.text, "html.parser")

# Look for the research narrative field
research_field = profile_soup.find("div", class_="field-name-field-research-narrative")
if research_field:
field_item = research_field.find("div", class_="field-item even")
if field_item:
profile_bio = field_item.get_text(separator=" ", strip=True)

time.sleep(0.5)
except Exception as e:
print(f"Error scraping profile page {profile_link}: {e}")
profile_bio = None

# --- Scrape external website for research/bio text ---
website_text = None
if website:
try:
print(f"Scraping website {count}: {name} - {website}...")
website_response = requests.get(website, timeout=10)
website_soup = BeautifulSoup(website_response.text, "html.parser")

# Extract text from the specific div with class "field-item even"
field_item = website_soup.find("div", class_="field-item even")
if field_item:
website_text = field_item.get_text(separator=" ", strip=True)
else:
# Fallback: try just "field-item" or "field-items"
field_item = website_soup.find("div", class_="field-item")
if field_item:
website_text = field_item.get_text(separator=" ", strip=True)

# Optional: be respectful with rate limiting
time.sleep(0.5)
except Exception as e:
print(f"Error scraping {website}: {e}")
website_text = None
else:
print(f"No website found for {name}")

faculty_data.append({
"name": name,
"profile_link": profile_link,
"title": title,
"office": office,
"email": email,
"phones": phones,
"website": website,
"image_url": image_url,
"field_of_study": field_of_study,
"profile_bio": profile_bio,
"website_text": website_text
})

# Save to JSON file
with open("faculty_data.json", "w", encoding="utf-8") as json_file:
json.dump(faculty_data, json_file, indent=2, ensure_ascii=False)

print(f"\nData saved to faculty_data.json")
print(f"Total faculty members scraped: {len(faculty_data)}")