import os
from typing import Dict, List, Optional, Any

# ODC-specific managed properties for SharePoint Graph API calls
# Includes both ODC and ODP (Operations Document Portal) fields
ODC_MANAGED_PROPERTIES = (
    "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,"
    "FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,"
    "Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes,"
    "ProjectID,ProjectName,ProjectTitle,ProjectType,ProjectSector,Phase,"
    "Region,Country,FocusCountry,GrantType,GrantWindow,GrantRecipient,Theme,"
    "ShortName,AllDocuments,Disclosable,Disclosed,NonIFAD,PLF,Sensitive,"
    "BorrowerID,CopyValidation,DocumentTypeID,ODCIntegration_CIMission"
)

# Graph API URL constant
GRAPH_API_URL = "https://graph.microsoft.com/v1.0"

# Site names (the path segment right after "/sites/") that count as ODC sites.
_ODC_SITE_NAMES = frozenset({"aprop", "lacop", "esaop", "nenop", "wcaop", "epop"})

# Keys that are always emitted, even when enrichment fails.
_REQUIRED_METADATA_KEYS = (
    "Category",
    "Site Name",
    "Document Library",
    "Division",
    "Department",
    "Content-Type",
    "File Type",
    "File Path",
)

# File extension -> content type for drive items (anything else is "Document").
_EXTENSION_CONTENT_TYPES = {
    ".ppt": "Presentation",
    ".pptx": "Presentation",
    ".doc": "Document",
    ".docx": "Document",
    ".pdf": "Document",
    ".xls": "Spreadsheet",
    ".xlsx": "Spreadsheet",
    ".mp4": "Video",
    ".avi": "Video",
    ".mov": "Video",
    ".jpg": "Image",
    ".jpeg": "Image",
    ".png": "Image",
    ".gif": "Image",
}

# Non-drive-item object type -> content type (anything else is "Document").
_OBJECT_TYPE_CONTENT_TYPES = {
    "site_page": "Web Page",
    "list_item": "List Item",
    "list_item_attachment": "Attachment",
}

# (metadata key, SharePoint field name) pairs that are copied verbatim.
# NOTE(review): "Division" and "Department" are both sourced from
# "BusinessUnit", exactly as before - confirm this is the intended mapping.
_PASSTHROUGH_FIELDS = (
    ("Division", "BusinessUnit"),
    ("Department", "BusinessUnit"),
    ("ProjectType", "ProjectType"),
    ("ProjectName", "ProjectName"),
    ("ProjectTitle", "ProjectTitle"),
    ("ProjectSector", "ProjectSector"),
    ("Region", "Region"),
    ("Country", "Country"),
    ("CountryID", "CountryID"),
    ("FocusCountryIDs", "FocusCountryIDs"),
    ("Year", "Year"),
    ("Phase", "Phase"),
    ("PhaseID", "PhaseID"),
    ("GrantType", "GrantType"),
    ("GrantWindow", "GrantWindow"),
    ("BorrowerID", "BorrowerID"),
    ("Disclosable", "Disclosable"),
    ("Disclosed", "Disclosed"),
    ("PLF", "PLF"),
    ("Sensitive", "Sensitive"),
    ("IsInDocSet", "IsInDocSet"),
    ("OPDIsLink", "OPDIsLink"),
    ("CopyValidation", "CopyValidation"),
    ("SentToRMS", "SentToRMS"),
    ("ODCIntegration_CIMission", "ODCIntegration_CIMission"),
    ("SystemSource", "ODCIntegration_SystemSource"),
    ("DocumentTypeID", "DocumentTypeID"),
    ("Project", "Project"),
    ("Title", "Title"),
    ("Author", "Author"),
    ("Editor", "Editor"),
    ("Created", "Created"),
    ("Modified", "Modified"),
    ("FileLeafRef", "FileLeafRef"),
    ("FileDirRef", "FileDirRef"),
    ("ContentType", "ContentType"),
    ("FileType", "File_x0020_Type"),
    ("DocIcon", "DocIcon"),
    ("FileSizeDisplay", "FileSizeDisplay"),
    ("UIVersionString", "_UIVersionString"),
    ("ActivityID", "ActivityID"),
    ("ActivityName", "ActivityName"),
)

# Fields whose values may be wrapped in Label/DisplayName/Value objects.
_COMPLEX_FIELDS = ("ShortName", "AllDocuments", "FocusCountry", "GrantRecipient", "Theme")

# Extracted metadata keys appended to the metadata array when non-null,
# in output order.
_EXPORTED_SHAREPOINT_FIELDS = (
    # Project and Activity Information
    "ActivityID", "ActivityName", "ProjectID", "ProjectType",
    "ProjectName", "ProjectTitle", "ProjectSector", "Project",
    "ShortName", "AllDocuments",
    # Geographic and Temporal
    "Region", "Country", "CountryID", "FocusCountry", "FocusCountryIDs",
    "Year", "Phase", "PhaseID",
    # Grant and Financing
    "GrantType", "GrantWindow", "GrantRecipient", "BorrowerID",
    # Themes and Classification
    "Theme", "Status", "DocumentTypeID",
    # Flags and Status
    "Disclosable", "Disclosed", "NonIFAD", "PLF", "Sensitive",
    "IsInDocSet", "OPDIsLink",
    # Validation and Compliance
    "CopyValidation", "SentToRMS",
    # System and Integration
    "SystemSource", "ODCIntegration_CIMission",
    # Standard SharePoint Fields
    "Title", "Author", "Editor", "Created", "Modified",
    "FileLeafRef", "FileDirRef", "ContentType", "FileType",
    "DocIcon", "FileSizeDisplay", "DocumentID", "DocumentIDUrl",
    "UIVersionString",
)


class SharePointMetadataEnricher:
    """Attach a structured ``metadata`` key/value array to SharePoint documents.

    The enricher reads SharePoint list fields (``document["fields"]``), the
    surrounding site / drive / list context and a few standard Graph
    properties, and flattens them into ``[{"key": ..., "value": ...}, ...]``.

    Args:
        logger: Optional logger; when ``None`` all log calls are no-ops.
        graph_api_client: Optional client exposing an awaitable ``fetch(url)``;
            only needed by ``get_drive_item_list_fields``.
    """

    def __init__(self, logger=None, graph_api_client=None):
        self.logger = logger
        self._graph_api_client = graph_api_client

    def _log_debug(self, message: str):
        """Log *message* at debug level if a logger is configured."""
        if self.logger:
            self.logger.debug(message)

    def _log_info(self, message: str):
        """Log *message* at info level if a logger is configured."""
        if self.logger:
            self.logger.info(message)

    def _log_warning(self, message: str):
        """Log *message* at warning level if a logger is configured."""
        if self.logger:
            self.logger.warning(message)

    def _is_odc_site(self, site):
        """Return True when the site's ``webUrl`` points at an official ODC site.

        The path segment following ``/sites/`` must equal one of the ODC site
        names (case-insensitive); sub-paths and a trailing slash are allowed.
        Matching the whole segment avoids false positives such as
        ``/sites/apropos`` matching ``aprop``.
        """
        if not site:
            return False

        # webUrl may be present but None, so do not rely on the .get default.
        web_url = (site.get("webUrl") or "").lower()

        for tail in web_url.split("/sites/")[1:]:
            segment = tail.split("/", 1)[0].split("?", 1)[0].split("#", 1)[0]
            if segment in _ODC_SITE_NAMES:
                return True

        return False

    def get_odc_managed_properties(self):
        """Get the ODC managed properties string for Graph API calls."""
        return ODC_MANAGED_PROPERTIES

    def should_include_odc_properties(self, site):
        """Check if ODC properties should be included in Graph API calls for this site."""
        return self._is_odc_site(site)

    @staticmethod
    def _match_list_for_drive(drive_name, site_lists):
        """Pick the list that best corresponds to a drive.

        Args:
            drive_name: Lower-cased drive name (may be empty).
            site_lists: Iterable of list dicts with ``id`` and ``name`` /
                ``displayName``.

        Returns:
            ``(list_id, list_name)`` for the first exact (case-insensitive)
            name match, otherwise for the first list that satisfies the
            document-library heuristic, otherwise ``None``.
        """
        candidates = []
        for site_list in site_lists:
            list_name = site_list.get("name") or site_list.get("displayName") or ""
            # An empty name is a substring of every drive name; skip it so it
            # cannot match everything.
            if list_name:
                candidates.append((site_list.get("id"), list_name))

        for list_id, list_name in candidates:
            if list_name.lower() == drive_name:
                return list_id, list_name

        # Heuristic fallback - document libraries are usually lists, but the
        # real mapping can be more complex than a name comparison.
        for list_id, list_name in candidates:
            lowered = list_name.lower()
            if "document" in lowered or "library" in lowered or lowered in drive_name:
                return list_id, list_name

        return None

    async def get_drive_list_mapping(self, site_id, site_drives_method, site_lists_method):
        """Map each drive of a site to its most likely SharePoint list.

        Args:
            site_id: Identifier passed to both listing callables.
            site_drives_method: Callable returning an async iterator of drives.
            site_lists_method: Callable returning an async iterator of lists.

        Returns:
            ``{drive_id: list_id}``. Errors are logged and whatever was mapped
            so far is returned.
        """
        drive_list_mapping = {}

        try:
            site_lists = None
            async for drive in site_drives_method(site_id):
                # The lists do not depend on the drive: fetch them once, and
                # only if the site has at least one drive.
                if site_lists is None:
                    site_lists = [item async for item in site_lists_method(site_id)]

                drive_id = drive.get("id")
                drive_name = (drive.get("name") or "").lower()

                match = self._match_list_for_drive(drive_name, site_lists)
                if match is None:
                    continue

                list_id, list_name = match
                drive_list_mapping[drive_id] = list_id
                self._log_info(f"Mapped drive '{drive.get('name')}' ({drive_id}) to list '{list_name}' ({list_id})")

        except Exception as e:
            self._log_warning(f"Error creating drive-list mapping for site {site_id}: {str(e)}")

        return drive_list_mapping

    async def get_drive_item_list_fields(self, drive_id, item_id):
        """Fetch the SharePoint list fields of a drive item.

        Calls ``/drives/{drive_id}/items/{item_id}/listItem/fields`` through
        the configured Graph API client.

        Returns:
            The response of ``fetch`` or ``{}`` when no client is configured,
            the response is empty, or the request fails (failures are logged
            at debug level, never raised).
        """
        if not self._graph_api_client:
            self._log_warning("No Graph API client available for fetching metadata fields")
            return {}

        try:
            url = f"{GRAPH_API_URL}/drives/{drive_id}/items/{item_id}/listItem/fields"
            response = await self._graph_api_client.fetch(url)
            if not response:
                return {}
            self._log_info(f"Retrieved {len(response)} custom fields for drive item {item_id}")
            return response

        except Exception as e:
            if "404" in str(e) or "NotFound" in str(e):
                # Item might be deleted or not linked to a listItem
                self._log_debug(f"No listItem/fields found for drive item {item_id} (404)")
            else:
                self._log_debug(f"Failed to get listItem/fields for item {item_id}: {str(e)}")
            return {}

    async def enrich_drive_item_with_list_metadata(self, drive_item, site_id=None, drive_list_mapping=None):
        """Return a copy of *drive_item* with its list fields under ``"fields"``.

        Args:
            drive_item: Drive item dict; needs ``id`` and
                ``parentReference.driveId``.
            site_id: Unused; kept for call-site compatibility.
            drive_list_mapping: Unused; kept for call-site compatibility.

        Returns:
            An enriched shallow copy when fields were found, otherwise the
            original *drive_item* unchanged.
        """
        try:
            item_id = drive_item.get("id")
            # parentReference can be missing or None.
            parent_ref = drive_item.get("parentReference") or {}
            drive_id = parent_ref.get("driveId")

            if not drive_id or not item_id:
                self._log_debug(f"Missing drive_id or item_id for drive item {item_id}")
                return drive_item

            custom_fields = await self.get_drive_item_list_fields(drive_id, item_id)

            if custom_fields:
                enriched_item = drive_item.copy()
                enriched_item["fields"] = custom_fields
                self._log_info(f"Enriched drive item {item_id} with {len(custom_fields)} SharePoint fields from listItem/fields")
                return enriched_item

            self._log_debug(f"No SharePoint listItem/fields found for drive item {item_id}")

        except Exception as e:
            self._log_warning(f"Error enriching drive item {drive_item.get('id')} with listItem/fields metadata: {str(e)}")

        return drive_item

    @staticmethod
    def _extract_field_value(field_data):
        """Unwrap a SharePoint field value.

        Dicts yield their ``Label``, ``DisplayName`` or ``Value`` entry (first
        one present, else ``None``); lists of dicts are unwrapped element-wise;
        everything else is returned unchanged.
        """
        if isinstance(field_data, dict):
            for key in ("Label", "DisplayName", "Value"):
                if key in field_data:
                    return field_data[key]
            return None

        if isinstance(field_data, list) and field_data and isinstance(field_data[0], dict):
            return [SharePointMetadataEnricher._extract_field_value(item) for item in field_data]

        return field_data

    def _category_for_site(self, site):
        """Derive the default category ("ODC", "Xdesk", "General") from the site URL."""
        if not site or not site.get("webUrl"):
            return None
        if self._is_odc_site(site):
            return "ODC"
        return "Xdesk" if "xdesk" in site["webUrl"].lower() else "General"

    def _extract_metadata_from_sharepoint_fields(
        self,
        document: Dict[str, Any],
        site: Optional[Dict[str, Any]] = None,
        site_drive: Optional[Dict[str, Any]] = None,
        site_list: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Flatten ``document["fields"]`` into a metadata dict.

        Args:
            document: Document whose optional ``fields`` dict is read.
            site: Optional site dict; its ``webUrl`` drives the category.
            site_drive: Unused here; accepted for signature symmetry.
            site_list: Unused here; accepted for signature symmetry.

        Returns:
            Dict of metadata key -> value; absent fields map to ``None``.
        """
        self._log_info(f"Extracting metadata for document {document.get('_id', 'unknown')}")

        # "fields" can be missing or explicitly None.
        fields = document.get("fields") or {}
        extract = self._extract_field_value

        metadata = {"Category": self._category_for_site(site)}

        for metadata_key, field_name in _PASSTHROUGH_FIELDS:
            metadata[metadata_key] = fields.get(field_name)

        for field_name in _COMPLEX_FIELDS:
            metadata[field_name] = extract(fields.get(field_name))

        # Document type - fall back to a type derived from the document itself
        metadata["Content-Type"] = (
            extract(fields.get("DocumentType")) or self._determine_content_type(document)
        )

        # Project id - fall back to the hidden field
        metadata["ProjectID"] = extract(fields.get("ProjectID")) or fields.get("ProjectID_Hidden")

        # The property is requested as "NonIFAD"; "NonIfad" is kept as a
        # fallback because that is the spelling read previously.
        non_ifad = fields.get("NonIFAD")
        if non_ifad is None:
            non_ifad = fields.get("NonIfad")
        metadata["NonIFAD"] = non_ifad

        # Document ID and linking
        dlc_doc_id = fields.get("_dlc_DocIdUrl")
        if isinstance(dlc_doc_id, dict) and "Description" in dlc_doc_id:
            metadata["DocumentID"] = dlc_doc_id["Description"]
            metadata["DocumentIDUrl"] = dlc_doc_id.get("Url")

        # Legacy status field handling
        metadata["Status"] = fields.get("OPDStatus") or fields.get("Status")

        # An explicit OPD/ODC category field overrides the URL-derived one
        odc_category = fields.get("OPDCategory") or fields.get("ODCCategory")
        if odc_category:
            metadata["Category"] = odc_category

        non_null = sum(1 for value in metadata.values() if value is not None)
        self._log_info(f"Extracted metadata with {non_null} non-null fields")
        return metadata

    def _determine_content_type(self, document: Dict[str, Any]) -> str:
        """Classify a document by object type and, for drive items, extension."""
        object_type = document.get("object_type", "")

        if object_type != "drive_item":
            return _OBJECT_TYPE_CONTENT_TYPES.get(object_type, "Document")

        if "folder" in document:
            return "Folder"

        ext = os.path.splitext(document.get("name") or "")[-1].lower()
        return _EXTENSION_CONTENT_TYPES.get(ext, "Document")

    def _site_path_from_web_url(self, web_url: str) -> str:
        """Return everything after the first ``/sites/`` in *web_url* ("" if absent)."""
        return web_url.partition("/sites/")[2]

    @staticmethod
    def _required_metadata_placeholders() -> List[Dict[str, Any]]:
        """Return the required metadata keys, each with a ``None`` value."""
        return [{"key": key, "value": None} for key in _REQUIRED_METADATA_KEYS]

    @staticmethod
    def _user_label(document: Dict[str, Any], actor_key: str) -> Optional[str]:
        """Return ``displayName`` (or ``email``) of ``document[actor_key]["user"]``.

        Any missing or non-dict level yields ``None`` instead of raising.
        """
        actor = document.get(actor_key)
        user = actor.get("user") if isinstance(actor, dict) else None
        if not isinstance(user, dict):
            return None
        return user.get("displayName") or user.get("email") or None

    def build_metadata_array(
        self,
        document: Dict[str, Any],
        site: Optional[Dict[str, Any]] = None,
        site_drive: Optional[Dict[str, Any]] = None,
        site_list: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """Build the ``[{"key", "value"}, ...]`` metadata array for a document.

        Args:
            document: The document to describe.
            site: Optional site context (name, URL).
            site_drive: Optional drive context; preferred source of the
                library name.
            site_list: Optional list context; used when no drive is given.

        Returns:
            The required keys first, then every non-null SharePoint field,
            then technical metadata. On an unexpected error only the required
            keys with ``None`` values are returned.
        """
        self._log_info(f"Building metadata array for document {document.get('_id', 'unknown')}")
        metadata_pairs = []

        def add(key, value):
            metadata_pairs.append({"key": key, "value": value})

        try:
            sharepoint_metadata = self._extract_metadata_from_sharepoint_fields(
                document, site, site_drive, site_list
            )

            # Standard metadata that should always be present
            add("Category", sharepoint_metadata.get("Category"))

            site_name = None
            if site:
                site_name = site.get("displayName") or site.get("name") or site.get("title")
            add("Site Name", site_name)

            library_name = None
            if site_drive:
                library_name = site_drive.get("name") or site_drive.get("displayName")
            elif site_list:
                library_name = site_list.get("name") or site_list.get("displayName")
            add("Document Library", library_name)

            add("Division", sharepoint_metadata.get("Division"))
            add("Department", sharepoint_metadata.get("Department"))
            add("Content-Type", sharepoint_metadata.get("Content-Type"))

            # File Type/Extension
            file_extension = None
            file_name = document.get("name") or document.get("_original_filename") or document.get("FileName", "")
            if file_name and "." in file_name:
                file_extension = os.path.splitext(file_name)[-1].lower()
            add("File Type", file_extension)

            # File Path/Location; parentReference may be None, which must not
            # abort the whole enrichment.
            parent_ref = document.get("parentReference")
            parent_path = parent_ref.get("path") if isinstance(parent_ref, dict) else None

            file_path = None
            if document.get("webUrl"):
                file_path = document["webUrl"]
            elif parent_path:
                file_path = parent_path
            elif site and site.get("webUrl"):
                site_path = self._site_path_from_web_url(site["webUrl"])
                file_path = f"{site_path}/{file_name}" if file_name else site_path
            add("File Path", file_path)

            # All non-null SharePoint metadata fields
            for field in _EXPORTED_SHAREPOINT_FIELDS:
                value = sharepoint_metadata.get(field)
                if value is not None:
                    add(field, value)

            # Additional technical metadata
            add("Object Type", document.get("object_type"))
            add("Document ID", document.get("_id"))
            add("Last Modified", document.get("_timestamp") or document.get("lastModifiedDateTime"))

            # Size information for files
            if document.get("size"):
                add("File Size", document.get("size"))

            add("Created By", self._user_label(document, "createdBy"))
            add("Modified By", self._user_label(document, "lastModifiedBy"))

            self._log_info(f"Built {len(metadata_pairs)} metadata pairs")

        except Exception as e:
            self._log_warning(f"Error building metadata array for document {document.get('_id')}: {str(e)}")
            # Return minimal metadata on error
            metadata_pairs = self._required_metadata_placeholders()

        return metadata_pairs

    def enrich_document_with_metadata(
        self,
        document: Dict[str, Any],
        site: Optional[Dict[str, Any]] = None,
        site_drive: Optional[Dict[str, Any]] = None,
        site_list: Optional[Dict[str, Any]] = None,
        enrich_metadata_enabled: bool = True
    ) -> Dict[str, Any]:
        """Return a shallow copy of *document* with a ``"metadata"`` array added.

        Args:
            document: The document to enrich; it is not mutated.
            site: Optional site context.
            site_drive: Optional drive context.
            site_list: Optional list context.
            enrich_metadata_enabled: When False the original *document* object
                is returned untouched.

        Returns:
            The enriched copy, or *document* itself when enrichment is
            disabled.
        """
        if not enrich_metadata_enabled:
            return document

        self._log_info(f"Enriching document {document.get('_id', 'unknown')} with metadata")
        enriched_document = document.copy()

        try:
            metadata_array = self.build_metadata_array(enriched_document, site, site_drive, site_list)
            enriched_document["metadata"] = metadata_array

            self._log_info(f"Successfully enriched document with {len(metadata_array)} metadata pairs")

        except Exception as e:
            self._log_warning(f"Failed to enrich document {enriched_document.get('_id')} with metadata: {str(e)}")
            # Ensure at least the required keys are present
            enriched_document["metadata"] = self._required_metadata_placeholders()

        return enriched_document
CURSOR_SYNC_TIMESTAMP, BaseDataSource +from connectors.sources.sharepoint_metadata_enricher import SharePointMetadataEnricher from connectors.utils import ( TIKA_SUPPORTED_FILETYPES, CacheWithTimeout, @@ -74,6 +75,7 @@ FILE_WRITE_CHUNK_SIZE = 1024 * 64 # 64KB default SSD page size MAX_DOCUMENT_SIZE = 10485760 WILDCARD = "*" +# Base fields for all drive items DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference" CURSOR_SITE_DRIVE_KEY = "site_drives" @@ -499,6 +501,14 @@ async def _get(self, absolute_url, retry_count=0): try: token = await self._api_token.get() headers = {"authorization": f"Bearer {token}"} + + # If the absolute_url contains "/sites?expand=sites" then add header prefer: HonorNonIndexedQueriesWarningMayFailRandomly as per Microsoft Support to handle sites with many subsites + if "/sites?expand=sites" in absolute_url: + headers["prefer"] = "HonorNonIndexedQueriesWarningMayFailRandomly" + self._logger.info( + f"Adding header prefer: HonorNonIndexedQueriesWarningMayFailRandomly to request {absolute_url}" + ) + self._logger.debug(f"Calling Sharepoint Endpoint: {absolute_url}") async with self._http_session.get( @@ -877,10 +887,25 @@ async def drive_items_delta(self, url): if "value" in response and len(response["value"]) > 0: yield DriveItemsPage(response["value"], delta_link) - async def drive_items(self, drive_id, url=None): + async def drive_items(self, drive_id, url=None, site=None, metadata_enricher=None): + # Build field list with conditional ODC properties + fields = DRIVE_ITEMS_FIELDS + + # Add ODC managed properties if this is an ODC site + # Use metadata enricher's ODC detection if available, otherwise fall back to local method + is_odc = False + if metadata_enricher and hasattr(metadata_enricher, 'should_include_odc_properties'): + is_odc = metadata_enricher.should_include_odc_properties(site) + elif site: + is_odc = 
self._is_odc_site(site) + + if is_odc and metadata_enricher: + odc_properties = metadata_enricher.get_odc_managed_properties() + fields = f"{DRIVE_ITEMS_FIELDS},{odc_properties}" + url = ( ( - f"{GRAPH_API_URL}/drives/{drive_id}/root/delta?$select={DRIVE_ITEMS_FIELDS}" + f"{GRAPH_API_URL}/drives/{drive_id}/root/delta?$select={fields}" ) if not url else url @@ -889,6 +914,19 @@ async def drive_items(self, drive_id, url=None): async for page in self.drive_items_delta(url): yield page + def _is_odc_site(self, site): + """Check if site is an ODC (Office Development Center) site based on URL or name patterns.""" + if not site: + return False + + web_url = site.get("webUrl", "").lower() + site_name = site.get("name", "").lower() + + # Check for ODC indicators in URL or site name + odc_indicators = ["odc", "office-development", "dev-center", "development-center"] + + return any(indicator in web_url or indicator in site_name for indicator in odc_indicators) + async def drive_items_permissions_batch(self, drive_id, drive_item_ids): requests = [] @@ -1228,9 +1266,24 @@ def __init__(self, configuration): self._client = None self.site_group_cache = {} + self._metadata_enricher = None def _set_internal_logger(self): self.client.set_logger(self._logger) + # Initialize metadata enricher with logger and graph api client + self._metadata_enricher = SharePointMetadataEnricher( + logger=self._logger, + graph_api_client=self.client._graph_api_client + ) + + @property + def metadata_enricher(self): + if not self._metadata_enricher: + self._metadata_enricher = SharePointMetadataEnricher( + logger=self._logger, + graph_api_client=self.client._graph_api_client + ) + return self._metadata_enricher @property def client(self): @@ -1392,6 +1445,15 @@ def get_default_configuration(cls): "type": "bool", "value": True, }, + "enrich_metadata": { + "display": "toggle", + "label": "Enrich documents with metadata", + "order": 17, + "tooltip": "Enable this option to enrich all documents with 
structured metadata including category, division, content type, and other SharePoint managed properties. The metadata will be stored as an array of key-value pairs in a 'metadata' field.", + "type": "bool", + "value": True, + "ui_restrictions": ["advanced"], + }, } async def validate_config(self): @@ -1454,6 +1516,28 @@ def _decorate_with_access_control(self, document, access_control): return document + def _enrich_document_with_metadata(self, document, site=None, site_drive=None, site_list=None): + """ + Enrich document with metadata using the dedicated metadata enricher. + + Args: + document: The document to enrich + site: Site context + site_drive: Drive context + site_list: List context + + Returns: + dict: Document enriched with metadata + """ + enrich_enabled = bool(self.configuration.get("enrich_metadata", True)) + return self.metadata_enricher.enrich_document_with_metadata( + document=document, + site=site, + site_drive=site_drive, + site_list=site_list, + enrich_metadata_enabled=enrich_enabled + ) + async def _site_access_control(self, site): """Fetches all permissions for all owners, members and visitors of a given site. All groups and/or persons, which have permissions for a given site are returned with their given identity prefix ("user", "group" or "email"). 
@@ -1743,6 +1827,8 @@ async def get_docs(self, filtering=None): max_drive_item_age = advanced_rules["skipExtractingDriveItemsOlderThan"] async for site_collection in self.site_collections(): + # Enrich site collection with metadata + site_collection = self._enrich_document_with_metadata(site_collection) yield site_collection, None async for site in self.sites( @@ -1754,20 +1840,30 @@ async def get_docs(self, filtering=None): site_admin_access_control, ) = await self._site_access_control(site) + # Enrich site with metadata and access control + enriched_site = self._enrich_document_with_metadata(site) + enriched_site = self._decorate_with_access_control(enriched_site, site_access_control) + yield ( - self._decorate_with_access_control(site, site_access_control), + enriched_site, None, ) async for site_drive in self.site_drives(site): + # Enrich site drive with metadata and access control + enriched_site_drive = self._enrich_document_with_metadata( + site_drive, site=site, site_drive=site_drive + ) + enriched_site_drive = self._decorate_with_access_control( + enriched_site_drive, site_access_control + ) + yield ( - self._decorate_with_access_control( - site_drive, site_access_control - ), + enriched_site_drive, None, ) - async for page in self.client.drive_items(site_drive["id"]): + async for page in self.client.drive_items(site_drive["id"], site=site, metadata_enricher=self.metadata_enricher): for drive_items_batch in iterable_batches_generator( page.items, SPO_API_MAX_BATCH_SIZE ): @@ -1782,6 +1878,17 @@ async def get_docs(self, filtering=None): "lastModifiedDateTime" ) + # HYBRID APPROACH: Enrich drive item with SharePoint list metadata using listItem/fields + if self.configuration.get("enrich_metadata", True): + drive_item = await self.metadata_enricher.enrich_drive_item_with_list_metadata( + drive_item, site["id"] + ) + + # Enrich with metadata + drive_item = self._enrich_document_with_metadata( + drive_item, site=site, site_drive=site_drive + ) + # Drive items 
should inherit site access controls only if # 'fetch_drive_item_permissions' is disabled in the config if not self.configuration[ @@ -1845,6 +1952,8 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): max_drive_item_age = advanced_rules["skipExtractingDriveItemsOlderThan"] async for site_collection in self.site_collections(): + # Enrich site collection with metadata + site_collection = self._enrich_document_with_metadata(site_collection) yield site_collection, None, OP_INDEX async for site in self.sites( @@ -1857,8 +1966,12 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): site_admin_access_control, ) = await self._site_access_control(site) + # Enrich site with metadata and access control + enriched_site = self._enrich_document_with_metadata(site) + enriched_site = self._decorate_with_access_control(enriched_site, site_access_control) + yield ( - self._decorate_with_access_control(site, site_access_control), + enriched_site, None, OP_INDEX, ) @@ -1867,10 +1980,16 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): # lastModifiedDateTime of the parent site_drive. Therefore, we # set check_timestamp to False when iterating over site_drives. 
async for site_drive in self.site_drives(site, check_timestamp=False): + # Enrich site drive with metadata and access control + enriched_site_drive = self._enrich_document_with_metadata( + site_drive, site=site, site_drive=site_drive + ) + enriched_site_drive = self._decorate_with_access_control( + enriched_site_drive, site_access_control + ) + yield ( - self._decorate_with_access_control( - site_drive, site_access_control - ), + enriched_site_drive, None, OP_INDEX, ) @@ -1878,7 +1997,7 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): delta_link = self.get_drive_delta_link(site_drive["id"]) async for page in self.client.drive_items( - drive_id=site_drive["id"], url=delta_link + drive_id=site_drive["id"], url=delta_link, site=site, metadata_enricher=self.metadata_enricher ): for drive_items_batch in iterable_batches_generator( page.items, SPO_API_MAX_BATCH_SIZE @@ -1894,6 +2013,17 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): "lastModifiedDateTime" ) + # HYBRID APPROACH: Enrich drive item with SharePoint list metadata using listItem/fields + if self.configuration.get("enrich_metadata", True): + drive_item = await self.metadata_enricher.enrich_drive_item_with_list_metadata( + drive_item, site["id"] + ) + + # Enrich with metadata + drive_item = self._enrich_document_with_metadata( + drive_item, site=site, site_drive=site_drive + ) + # Drive items should inherit site access controls only if # 'fetch_drive_item_permissions' is disabled in the config if not self.configuration[ @@ -2091,13 +2221,18 @@ def _get_login_name(permissions, label): return self._decorate_with_access_control(drive_item, access_control) - async def drive_items(self, site_drive, max_drive_item_age): - async for page in self.client.drive_items(site_drive["id"]): + async def drive_items(self, site_drive, max_drive_item_age, site=None): + async for page in self.client.drive_items(site_drive["id"], site=site, metadata_enricher=self.metadata_enricher): for 
drive_item in page: drive_item["_id"] = drive_item["id"] drive_item["object_type"] = "drive_item" drive_item["_timestamp"] = drive_item["lastModifiedDateTime"] + # Enrich with metadata + drive_item = self._enrich_document_with_metadata( + drive_item, site=site, site_drive=site_drive + ) + yield drive_item, self.download_function(drive_item, max_drive_item_age) async def site_list_items( @@ -2207,11 +2342,21 @@ async def site_list_items( ACCESS_CONTROL, [] ) + # Enrich attachment with metadata before yielding + list_item_attachment = self._enrich_document_with_metadata( + list_item_attachment, site=site, site_list={"id": site_list_id, "name": site_list_name} + ) + attachment_download_func = partial( self.get_attachment_content, list_item_attachment ) yield list_item_attachment, attachment_download_func + # Enrich list item with metadata before yielding + list_item = self._enrich_document_with_metadata( + list_item, site=site, site_list={"id": site_list_id, "name": site_list_name} + ) + yield list_item, None async def site_lists(self, site, site_access_control, check_timestamp=False): @@ -2264,6 +2409,11 @@ async def site_lists(self, site, site_access_control, check_timestamp=False): site_list, site_access_control ) + # Enrich site list with metadata before yielding + site_list = self._enrich_document_with_metadata( + site_list, site=site, site_list=site_list + ) + yield site_list async def _get_access_control_from_role_assignment(self, role_assignment): @@ -2399,6 +2549,11 @@ async def site_pages(self, site, site_access_control, check_timestamp=False): if html_field in site_page: site_page[html_field] = html_to_text(site_page[html_field]) + # Enrich site page with metadata before yielding + site_page = self._enrich_document_with_metadata( + site_page, site=site + ) + yield site_page def init_sync_cursor(self): @@ -2628,9 +2783,17 @@ async def _access_control_for_member(self, member): a reference to a group's owners, or an individual, and will act accordingly. 
:param member: The dict representing a generic SPO entity. May be a group or an individual :return: the access control list (ACL) for this "member" + + Detect when a member has the login name: c:0-.f|rolemanager|spo-grid-all-users. + Map it to a standard identifier in _allow_access_control """ login_name = member.get("LoginName") + # Handle "Everyone Except External Users" group + if login_name and login_name.startswith("c:0-.f|rolemanager|spo-grid-all-users"): + self._logger.debug(f"Detected 'Everyone Except External Users' group: '{member.get('Title')}'.") + return ["group:EveryoneExceptExternalUsers"] + # 'LoginName' looking like a group indicates a group is_group = ( login_name.startswith("c:0o.c|federateddirectoryclaimprovider|") diff --git a/tests/sources/test_sharepoint_online.py b/tests/sources/test_sharepoint_online.py index dea56c28f..401b5cca3 100644 --- a/tests/sources/test_sharepoint_online.py +++ b/tests/sources/test_sharepoint_online.py @@ -3672,6 +3672,18 @@ def test_prefix_user_id(self): }, [_prefix_group(GROUP_ONE_ID)], ), + ( + # Everyone Except External Users group (access control: mapped group identifier) + { + "Member": { + "odata.type": "SP.User", + "LoginName": "c:0-.f|rolemanager|spo-grid-all-users", + "Title": "Everyone except external users", + }, + "RoleDefinitionBindings": READ_BINDING, + }, + ["group:EveryoneExceptExternalUsers"], + ), ( # Unknown type (access control: nothing) {