Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/gradescopeapi/classes/_helpers/_download_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from pathlib import Path


def download_url(session, url: str, output_path: Path) -> Path:
    """Stream the file at *url* to *output_path* and return the written path.

    Args:
        session: requests.Session-like object exposing ``get``.
        url: URL of the file to download.
        output_path: Destination file path; parent directories are created
            as needed.

    Returns:
        The destination path as a ``pathlib.Path``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # A streamed response holds its connection open until fully consumed;
    # the context manager guarantees it is closed even if writing fails.
    with session.get(url, stream=True, allow_redirects=True) as r:
        r.raise_for_status()
        with open(output_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    return output_path
169 changes: 169 additions & 0 deletions src/gradescopeapi/classes/_helpers/_export_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import json
import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Seconds to wait between polls of the export-status endpoint.
DEFAULT_POLL_INTERVAL = 1


def get_submission_download_url(
    base_url: str,
    course_id: str,
    assignment_id: str,
    submission_id: str,
) -> str:
    """Build the direct download URL for a single submission's zip archive."""
    path = "/".join(
        [
            "courses",
            course_id,
            "assignments",
            assignment_id,
            "submissions",
            f"{submission_id}.zip",
        ]
    )
    return f"{base_url}/{path}"


def _review_grades_url(base_url: str, course_id: str, assignment_id: str) -> str:
    """Build the URL of the assignment's review_grades page."""
    parts = [base_url, "courses", course_id, "assignments", assignment_id, "review_grades"]
    return "/".join(parts)


def _find_submission_zip_link_on_assignment_page(
    soup: BeautifulSoup, base_url: str
) -> str | None:
    """Return the absolute URL of the first submission-zip anchor, or None."""
    for anchor in soup.find_all("a", href=True):
        href = anchor.get("href", "")
        if not href.endswith(".zip"):
            continue
        if "/submissions/" not in href:
            continue
        # Relative hrefs are resolved against the Gradescope base URL.
        return href if href.startswith("http") else urljoin(base_url, href)
    return None


def get_latest_submission_download_url(
    session,
    base_url: str,
    course_id: str,
    assignment_id: str,
) -> str:
    """Locate the submission-zip download link on an assignment page.

    Raises:
        ValueError: If no submission zip link is present on the page.
    """
    assignment_page = f"{base_url}/courses/{course_id}/assignments/{assignment_id}"
    response = session.get(assignment_page)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")
    link = _find_submission_zip_link_on_assignment_page(page, base_url)
    if not link:
        raise ValueError("No 'Download Submission' link found on the assignment page.")
    return link


def _find_generated_file_link(soup: BeautifulSoup, base_url: str) -> str | None:
    """Return the absolute URL of the first generated-files zip anchor, or None."""
    anchors = soup.find_all("a", href=True)
    for anchor in anchors:
        href = anchor.get("href", "")
        if "/generated_files/" in href and href.endswith(".zip"):
            # Resolve relative hrefs against the Gradescope base URL.
            return urljoin(base_url, href) if not href.startswith("http") else href
    return None


def _poll_generated_file_until_ready(
    session,
    base_url: str,
    course_id: str,
    file_id: int,
    poll_interval: float = DEFAULT_POLL_INTERVAL,
    poll_max: int | None = None,
) -> str:
    """Poll the generated-file status endpoint until the export completes.

    Args:
        session: requests.Session-like object used for the status requests.
        base_url: Gradescope base URL.
        course_id: Course the generated file belongs to.
        file_id: Id of the generated file being produced server-side.
        poll_interval: Seconds to sleep between status checks.
        poll_max: Maximum number of status checks; None polls indefinitely.

    Returns:
        URL of the finished zip once the server reports "completed".

    Raises:
        ValueError: On a non-JSON response or an unexpected status value.
        TimeoutError: If poll_max checks pass without completion.
    """
    status_url = f"{base_url}/courses/{course_id}/generated_files/{file_id}.json"
    headers = {"Accept": "application/json", "X-Requested-With": "XMLHttpRequest"}
    attempt = 0
    while poll_max is None or attempt < poll_max:
        attempt += 1
        r = session.get(status_url, headers=headers)
        r.raise_for_status()
        try:
            data = r.json()
        except json.JSONDecodeError:
            raise ValueError(f"Unexpected response from {status_url}") from None
        status = (data.get("status") or "").strip()
        if status == "completed":
            return f"{base_url}/courses/{course_id}/generated_files/{file_id}.zip"
        if status not in ("processing", "unprocessed"):
            raise ValueError(f"Export status: {status}")
        # Don't sleep after the final allowed attempt: fall straight through
        # to the timeout instead of wasting one extra poll_interval.
        if poll_max is not None and attempt >= poll_max:
            break
        time.sleep(poll_interval)
    raise TimeoutError("Export did not complete in time.")


def _get_export_csrf_and_headers(soup: BeautifulSoup) -> tuple[str | None, dict]:
    """Return (csrf_token or None, headers) for the export request."""
    meta_tag = soup.find("meta", {"name": "csrf-token"})
    token = meta_tag.get("content") if meta_tag else None
    if not token:
        # Fall back to the hidden Rails form field when the meta tag is absent.
        hidden_input = soup.find("input", {"name": "authenticity_token"}) or {}
        token = hidden_input.get("value")
    headers = {
        "Accept": "application/json, text/html",
        "X-Requested-With": "XMLHttpRequest",
    }
    if token:
        headers["X-CSRF-Token"] = token
    return (token, headers)


def _get_file_id_from_export_response(r) -> int | None:
"""Parse export response; return generated file id if present."""
content_type = (r.headers.get("content-type") or "").strip().lower()
if "application/json" not in content_type:
return None
try:
data = r.json()
file_id = data.get("id") or data.get("generated_file_id")
if file_id is not None:
return int(file_id)
except (json.JSONDecodeError, ValueError, AttributeError, TypeError):
pass
return None


def get_export_all_download_url(
    session,
    base_url: str,
    course_id: str,
    assignment_id: str,
    poll_interval: float = DEFAULT_POLL_INTERVAL,
    poll_max: int | None = None,
) -> str:
    """Return the URL of the assignment export submissions zip file. This is not an idempotent operation.

    If a zip is already available (e.g. from a prior export), returns its URL
    immediately. Otherwise triggers export, waits until the file exists on the
    server, then returns that URL. Later calls may be faster or slower depending
    on whether the file is still available.

    Args:
        session: HTTP session object providing ``get``/``post``
            (e.g. requests.Session; presumably already logged in — verify with caller).
        base_url: Gradescope base URL.
        course_id: Course containing the assignment.
        assignment_id: Assignment whose submissions are exported.
        poll_interval: Seconds between status checks while the export runs.
        poll_max: Maximum number of status checks; None polls indefinitely.

    Raises:
        ValueError: If the export request yields neither a file id nor a link.
        TimeoutError: If the export does not complete within poll_max checks.
    """
    # Check the review_grades page first: a previously generated zip may
    # already be linked there, in which case no new export is triggered.
    submissions_url = _review_grades_url(base_url, course_id, assignment_id)
    r = session.get(submissions_url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    download_link = _find_generated_file_link(soup, base_url)
    if download_link:
        return download_link

    export_url = f"{base_url}/courses/{course_id}/assignments/{assignment_id}/export"
    token, headers = _get_export_csrf_and_headers(soup)
    if token:
        # POST with the CSRF token scraped from the review_grades page;
        # redirects are suppressed so the response body can be inspected.
        r = session.post(
            export_url,
            data={"authenticity_token": token},
            headers=headers,
            allow_redirects=False,
        )
    else:
        # No CSRF token found on the page: fall back to a plain GET of the
        # export endpoint.
        r = session.get(export_url, headers=headers, allow_redirects=False)
    r.raise_for_status()

    # TODO: Before release, confirm whether export ever returns zip in response body.
    # No logs or HAR evidence so far. Based on test results: uncomment zip-in-body block
    # (restore return (None, r), return type tuple, branching in download_all_submissions)
    # or remove it; and refine the ValueError message below if we see a specific failure shape.
    # if r.headers.get("content-type", "").strip().startswith("application/zip"):
    #     return (None, r)

    # A JSON body carrying a generated-file id means the export was queued;
    # poll the status endpoint until the server reports it completed.
    file_id = _get_file_id_from_export_response(r)
    if file_id is not None:
        return _poll_generated_file_until_ready(
            session,
            base_url,
            course_id,
            file_id,
            poll_interval=poll_interval,
            poll_max=poll_max,
        )

    raise ValueError("Export request did not return a usable response.")
191 changes: 191 additions & 0 deletions src/gradescopeapi/classes/_helpers/_submission_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
"""Parse the review_grades submissions table into structured data."""

from datetime import datetime
from urllib.parse import urljoin
from zoneinfo import ZoneInfo

from bs4 import BeautifulSoup

# Timezone that parsed submission timestamps are normalized to.
_UTC = ZoneInfo("UTC")

# <time datetime="2025-09-22 08:00:55 -0400"> - offset is in the string.
_DATETIME_FMT = "%Y-%m-%d %H:%M:%S %z"


def _parse_submitted_at(cell) -> datetime | None:
"""Parse submission time from <time datetime="..."> (offset in string). Return UTC or None."""
if cell is None:
return None
time_tag = getattr(cell, "find", lambda *a, **k: None)("time")
if time_tag is None:
return None
dt_attr = time_tag.get("datetime")
if not dt_attr:
return None
try:
dt = datetime.strptime(dt_attr.strip(), _DATETIME_FMT)
return dt.astimezone(_UTC)
except ValueError:
return None


def _parse_sections_cell(cell) -> list[str]:
    """
    Sections from DOM: .sectionsColumnCell--sectionSpan (one section per span).
    If no spans, return [].
    """
    if cell is None:
        return []
    selector = getattr(cell, "select", None)
    spans = selector(".sectionsColumnCell--sectionSpan") if selector is not None else []
    names: list[str] = []
    for span in spans:
        label = span.get_text(strip=True)
        if label:  # drop whitespace-only spans
            names.append(label)
    return names


def _has_icon(cell, class_substring: str) -> bool:
    """True if cell contains an <i> whose class list includes class_substring (e.g. 'fa-check')."""
    if cell is None:
        return False

    def matches(class_attr):
        # bs4 may hand the class attribute over as a list or as a string.
        if not class_attr:
            return False
        tokens = class_attr if isinstance(class_attr, list) else class_attr.split()
        return class_substring in tokens

    return cell.find("i", class_=matches) is not None


def _is_graded(cell) -> bool:
    """True if cell contains i.fa-check (Submission is graded).

    Our HTML samples show no fa-* icon at all in the ungraded case, so the
    absence of fa-check is treated as "not graded".
    """
    return _has_icon(cell, "fa-check")


def _is_viewed(cell) -> bool:
    """True if cell contains i.fa-eye (Submission has been viewed).

    False means no fa-eye icon; in our samples the unviewed cell instead
    carries statusIcon-inactive plus the text 'Submission has not been viewed.'.
    """
    return _has_icon(cell, "fa-eye")


def _is_canvas_linked(cell) -> bool:
    """True if cell contains i.fa-link (Linked to Canvas), False if i.fa-unlink (No Canvas link)."""
    if cell is None:
        return False
    # An explicit fa-unlink icon means "not linked", so check for it first.
    if _has_icon(cell, "fa-unlink"):
        return False
    return _has_icon(cell, "fa-link")


def _is_late(cell) -> bool:
    """True if the time cell contains the late badge."""
    return cell is not None and cell.select_one(".lateSubmissionBadge") is not None


def _score_from_cell(cell) -> float | None:
"""Parse score cell text as float; None if missing or not a number."""
if cell is None:
return None
text = getattr(cell, "get_text", lambda *a: "")(" ", strip=True)
if not text or "doesn't have a submission" in text.lower():
return None
try:
return float(text)
except ValueError:
return None


def _row_cells_to_dict(tr, base_url: str) -> dict | None:
    """
    Infer each field from the row's cells using DOM structure (tags, classes, attributes).
    Returns a parsed row dict, or None if the row has no cells.

    Args:
        tr: A <tr> Tag from the review_grades submissions table.
        base_url: Gradescope base URL used to absolutize relative hrefs.

    Returns:
        Dict with keys: submission_url, student_name, email, sections,
        score, graded, viewed, canvas_linked, submitted_at, late.
        All submission-derived fields are forced to their "no submission"
        defaults (None/False/[]) when no submission link is found.
    """
    cells = tr.find_all("td")
    if not cells:
        return None

    # Submission link and student_name: first <a> under the first td.table--primaryLink (First Last column; second is Last, First)
    primary_cells = tr.find_all("td", class_=lambda c: c and "table--primaryLink" in (c if isinstance(c, list) else c.split()))
    first_primary = primary_cells[0] if primary_cells else None
    link = first_primary.find("a", href=True) if first_primary else None
    submission_url: str | None = None
    if link and "/submissions/" in link.get("href", ""):
        href = link.get("href", "")
        submission_url = href if href.startswith("http") else urljoin(base_url, href)
    # Name: link text when we have a submission; else first td.table--primaryLink; when there's no submission that cell has no class, so fall back to first cell
    student_name = (
        link.get_text() if (link and submission_url)
        else (first_primary.get_text() if first_primary else (cells[0].get_text() if cells else None))
    )

    # Email: <td> containing <a href="mailto:...">
    mailto = tr.find("a", href=lambda h: h and h.startswith("mailto:"))
    email_cell = mailto.find_parent("td") if mailto else None

    # Sections: <td> containing .sectionsColumnCell
    sections_cell = next((td for td in cells if td.select_one(".sectionsColumnCell")), None)
    if sections_cell is None:
        # Fallback: cell with comma+slash (section-like content)
        for td in cells:
            text = td.get_text(" ", strip=True)
            if "," in text and "/" in text:
                sections_cell = td
                break

    # Time: <td> containing <time>
    time_cell = next((td for td in cells if td.find("time")), None)

    # Score, graded, viewed, canvas: find by DOM/content.
    # NOTE: the elif order matters — each cell is claimed by the first
    # predicate that matches, so a cell can fill at most one role.
    score_cell = None
    graded_cell = None
    viewed_cell = None
    canvas_cell = None
    for td in cells:
        text = td.get_text(" ", strip=True)
        if _is_graded(td):
            graded_cell = td
        elif _is_viewed(td) or "not been viewed" in text.lower():
            viewed_cell = td
        elif _is_canvas_linked(td):
            canvas_cell = td
        elif _score_from_cell(td) is not None or "doesn't have a submission" in text.lower():
            score_cell = td

    # Without a submission link, submission-derived fields default to
    # None/False regardless of what the cells contain.
    return {
        "submission_url": submission_url,
        "student_name": student_name,
        "email": email_cell.get_text() if email_cell else None,
        "sections": _parse_sections_cell(sections_cell),
        "score": _score_from_cell(score_cell) if submission_url and score_cell else None,
        "graded": _is_graded(graded_cell) if submission_url else False,
        "viewed": _is_viewed(viewed_cell) if submission_url else False,
        "canvas_linked": _is_canvas_linked(canvas_cell) if submission_url else False,
        "submitted_at": _parse_submitted_at(time_cell) if submission_url else None,
        "late": _is_late(time_cell) if submission_url else False,
    }


def parse_submissions_table(
    soup: BeautifulSoup,
    base_url: str,
    course_id: str,
    assignment_id: str,
) -> list[dict]:
    """
    Parse the review_grades submissions table into a list of parsed row dicts.

    Each row is interpreted by inspecting cell content/structure (no header lookup).
    Each dict: submission_url (str | None; null = no submission), student_name, email, sections,
    score (float | None), graded, viewed, canvas_linked (bool), submitted_at (datetime | None), late (bool).
    """
    table = soup.find("table", id="submissions-table") or soup.find("table")
    if not table:
        return []

    body = table.find("tbody")
    # Without a <tbody>, treat the first <tr> as the header row and skip it.
    rows = body.find_all("tr") if body else table.find_all("tr")[1:]

    parsed: list[dict] = []
    for row in rows:
        record = _row_cells_to_dict(row, base_url)
        # Keep only rows that carry both a name and an email.
        if record is not None and record["student_name"] and record["email"]:
            parsed.append(record)
    return parsed
Loading