From 1eea22cbe39e478d1a5943ce5dad26d453f137a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lle=C3=AFr=20Borr=C3=A0s=20Metje?= Date: Tue, 10 Jun 2025 09:55:36 +0200 Subject: [PATCH 1/9] HonorNonIndexedQueriesWarningMayFailRandomly header --- connectors/sources/sharepoint_online.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py index 5786bcb4f..d90e2cadd 100644 --- a/connectors/sources/sharepoint_online.py +++ b/connectors/sources/sharepoint_online.py @@ -499,6 +499,14 @@ async def _get(self, absolute_url, retry_count=0): try: token = await self._api_token.get() headers = {"authorization": f"Bearer {token}"} + + # If the absolute_url contains "/sites?expand=sites" then add header prefer: HonorNonIndexedQueriesWarningMayFailRandomly as per Microsoft Support to handle sites with many subsites + if "/sites?expand=sites" in absolute_url: + headers["prefer"] = "HonorNonIndexedQueriesWarningMayFailRandomly" + self._logger.info( + f"Adding header prefer: HonorNonIndexedQueriesWarningMayFailRandomly to request {absolute_url}" + ) + self._logger.debug(f"Calling Sharepoint Endpoint: {absolute_url}") async with self._http_session.get( From 90a0a11be11a9174d66ba47cc56fdcdb3ca5911c Mon Sep 17 00:00:00 2001 From: claudiu Date: Tue, 8 Jul 2025 13:09:48 +0300 Subject: [PATCH 2/9] Handle Everyone Except External Users group --- connectors/sources/sharepoint_online.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py index d90e2cadd..89d1ee638 100644 --- a/connectors/sources/sharepoint_online.py +++ b/connectors/sources/sharepoint_online.py @@ -2636,9 +2636,17 @@ async def _access_control_for_member(self, member): a reference to a group's owners, or an individual, and will act accordingly. :param member: The dict representing a generic SPO entity. May be a group or an individual :return: the access control list (ACL) for this "member" + + Detect when a member has the login name: c:0-.f|rolemanager|spo-grid-all-users. + Map it to a standard identifier in _allow_access_control, for example: group:EveryoneExceptExternalUsers """ login_name = member.get("LoginName") + # Handle "Everyone Except External Users" group + if login_name == "c:0-.f|rolemanager|spo-grid-all-users": + self._logger.debug(f"Detected 'Everyone Except External Users' group: '{member.get('Title')}'.") + return ["group:EveryoneExceptExternalUsers"] + # 'LoginName' looking like a group indicates a group is_group = ( login_name.startswith("c:0o.c|federateddirectoryclaimprovider|") From cce8f2ebd29f8416fc87a7080a19e1e6c23c4481 Mon Sep 17 00:00:00 2001 From: claudiu Date: Tue, 8 Jul 2025 13:11:36 +0300 Subject: [PATCH 3/9] Updated comment --- connectors/sources/sharepoint_online.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py index 89d1ee638..25f3da44e 100644 --- a/connectors/sources/sharepoint_online.py +++ b/connectors/sources/sharepoint_online.py @@ -2638,7 +2638,7 @@ async def _access_control_for_member(self, member): :return: the access control list (ACL) for this "member" Detect when a member has the login name: c:0-.f|rolemanager|spo-grid-all-users. - Map it to a standard identifier in _allow_access_control, for example: group:EveryoneExceptExternalUsers + Map it to a standard identifier in _allow_access_control """ login_name = member.get("LoginName") From 9a4f14fcd4dccd1838f8c585fa45cb490daee2da Mon Sep 17 00:00:00 2001 From: claudiu Date: Wed, 9 Jul 2025 18:25:28 +0300 Subject: [PATCH 4/9] Added test and updated the check for external users --- connectors/sources/sharepoint_online.py | 2 +- tests/sources/test_sharepoint_online.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py index 25f3da44e..0cadc78d8 100644 --- a/connectors/sources/sharepoint_online.py +++ b/connectors/sources/sharepoint_online.py @@ -2643,7 +2643,7 @@ async def _access_control_for_member(self, member): login_name = member.get("LoginName") # Handle "Everyone Except External Users" group - if login_name == "c:0-.f|rolemanager|spo-grid-all-users": + if login_name and login_name.startswith("c:0-.f|rolemanager|spo-grid-all-users"): self._logger.debug(f"Detected 'Everyone Except External Users' group: '{member.get('Title')}'.") return ["group:EveryoneExceptExternalUsers"] diff --git a/tests/sources/test_sharepoint_online.py b/tests/sources/test_sharepoint_online.py index dea56c28f..401b5cca3 100644 --- a/tests/sources/test_sharepoint_online.py +++ b/tests/sources/test_sharepoint_online.py @@ -3672,6 +3672,18 @@ def test_prefix_user_id(self): }, [_prefix_group(GROUP_ONE_ID)], ), + ( + # Everyone Except External Users group (access control: mapped group identifier) + { + "Member": { + "odata.type": "SP.User", + "LoginName": "c:0-.f|rolemanager|spo-grid-all-users", + "Title": "Everyone except external users", + }, + "RoleDefinitionBindings": READ_BINDING, + }, + ["group:EveryoneExceptExternalUsers"], + ), ( # Unknown type (access control: nothing) { From fdecd34ff5bf286082c0fccd774a96905edc59ee Mon Sep 17 00:00:00 2001 From: claudiu Date: Wed, 16 Jul 2025 13:30:51 +0300 Subject: [PATCH 5/9] Added new metadata from sharepoint --- connectors/sources/sharepoint_online.py | 315 +++++++++++++++++++++++- 1 file changed, 306 insertions(+), 9 deletions(-) diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py index 0cadc78d8..413ba846a 100644 --- a/connectors/sources/sharepoint_online.py +++ b/connectors/sources/sharepoint_online.py @@ -1400,6 +1400,15 @@ def get_default_configuration(cls): "type": "bool", "value": True, }, + "enrich_metadata": { + "display": "toggle", + "label": "Enrich documents with metadata", + "order": 17, + "tooltip": "Enable this option to enrich all documents with structured metadata including category, division, content type, and other SharePoint managed properties. The metadata will be stored as an array of key-value pairs in a 'metadata' field.", + "type": "bool", + "value": True, + "ui_restrictions": ["advanced"], + }, } async def validate_config(self): @@ -1462,6 +1471,235 @@ def _decorate_with_access_control(self, document, access_control): return document + def _extract_metadata_from_sharepoint_fields(self, document, site=None, site_drive=None, site_list=None): + """ + Extract metadata from SharePoint fields and map them to standardized metadata structure. + + Args: + document: The SharePoint document/item + site: Site information (optional) + site_drive: Drive information (optional) + site_list: List information (optional) + + Returns: + dict: Extracted metadata as key-value pairs + """ + metadata = {} + + + fields = document.get("fields", {}) + + # Map SharePoint managed properties to your metadata structure + # Based on your SEARCH Managed properties table + # Category mapping + odc_category = fields.get("OPDCategory") or fields.get("ODCCategory") + if odc_category: + metadata["Category"] = odc_category + else: + # Try to determine category from site URL or other indicators + if site and site.get("webUrl"): + site_url = site["webUrl"].lower() + if "odc" in site_url: + metadata["Category"] = "ODC" + elif "xdesk" in site_url: + metadata["Category"] = "Xdesk" + else: + metadata["Category"] = None + else: + metadata["Category"] = None + + # Business Unit / Division + metadata["Division"] = fields.get("BusinessUnit") + metadata["Department"] = fields.get("BusinessUnit") # Using same field as Department + + # Document Type + metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document) + + # Activity and Project information + metadata["ActivityID"] = fields.get("ActivityID") + metadata["ActivityName"] = fields.get("ActivityName") + metadata["ProjectID"] = fields.get("ProjectID") + metadata["ProjectType"] = fields.get("ProjectType") + + # Geographic and temporal metadata + metadata["Region"] = fields.get("Region") + metadata["FocusCountry"] = fields.get("FocusCountry") + metadata["Year"] = fields.get("Year") + metadata["Phase"] = fields.get("Phase") + + # Status and classification + metadata["Status"] = fields.get("OPDStatus") + metadata["GrantType"] = fields.get("GrantType") + metadata["GrantWindow"] = fields.get("GrantWindow") + + # Boolean flags + metadata["Disclosable"] = fields.get("Disclosable") + metadata["NonIFAD"] = fields.get("NonIfad") + metadata["PLF"] = fields.get("PLF") + + # System information + metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource") + + return metadata + + def _determine_content_type(self, document): + """ + Determine content type based on document properties. + + Args: + document: The SharePoint document + + Returns: + str: Content type classification + """ + object_type = document.get("object_type", "") + + if object_type == "drive_item": + name = document.get("name", "") + if "folder" in document: + return "Folder" + elif name: + ext = os.path.splitext(name)[-1].lower() + if ext in ['.ppt', '.pptx']: + return "Presentation" + elif ext in ['.doc', '.docx', '.pdf']: + return "Document" + elif ext in ['.xls', '.xlsx']: + return "Spreadsheet" + elif ext in ['.mp4', '.avi', '.mov']: + return "Video" + elif ext in ['.jpg', '.jpeg', '.png', '.gif']: + return "Image" + else: + return "Document" + elif object_type == "site_page": + return "Web Page" + elif object_type == "list_item": + return "List Item" + elif object_type == "list_item_attachment": + return "Attachment" + else: + return "Document" + + def _build_metadata_array(self, document, site=None, site_drive=None, site_list=None): + """ + Build metadata array as key-value pairs for the document. + + Args: + document: The SharePoint document + site: Site context + site_drive: Drive context + site_list: List context + + Returns: + list: Array of metadata key-value pairs + """ + metadata_pairs = [] + + # Extract SharePoint-specific metadata + sharepoint_metadata = self._extract_metadata_from_sharepoint_fields( + document, site, site_drive, site_list + ) + + # Standard metadata that should always be present + + # Site Name + site_name = None + if site: + site_name = site.get("displayName") or site.get("name") or site.get("title") + metadata_pairs.append({"key": "Site Name", "value": site_name}) + + # Document Library / Drive Name + library_name = None + if site_drive: + library_name = site_drive.get("name") or site_drive.get("displayName") + elif site_list: + library_name = site_list.get("name") or site_list.get("displayName") + metadata_pairs.append({"key": "Document Library", "value": library_name}) + + # File Type/Extension + file_extension = None + file_name = document.get("name") or document.get("_original_filename") or document.get("FileName", "") + if file_name and "." in file_name: + file_extension = os.path.splitext(file_name)[-1].lower() + metadata_pairs.append({"key": "File Type", "value": file_extension}) + + # File Path/Location + file_path = None + if document.get("webUrl"): + file_path = document["webUrl"] + elif document.get("parentReference", {}).get("path"): + file_path = document["parentReference"]["path"] + elif site and site.get("webUrl"): + # Construct path from site URL and document name + site_path = self._site_path_from_web_url(site["webUrl"]) + if file_name: + file_path = f"{site_path}/{file_name}" + else: + file_path = site_path + metadata_pairs.append({"key": "File Path", "value": file_path}) + + # Add all SharePoint metadata fields + for key, value in sharepoint_metadata.items(): + metadata_pairs.append({"key": key, "value": value}) + + # Additional technical metadata + metadata_pairs.append({"key": "Object Type", "value": document.get("object_type")}) + metadata_pairs.append({"key": "Document ID", "value": document.get("_id")}) + metadata_pairs.append({"key": "Last Modified", "value": document.get("_timestamp") or document.get("lastModifiedDateTime")}) + + # Size information for files + if document.get("size"): + metadata_pairs.append({"key": "File Size", "value": document.get("size")}) + + # Creator information + created_by = None + if document.get("createdBy", {}).get("user", {}).get("displayName"): + created_by = document["createdBy"]["user"]["displayName"] + elif document.get("createdBy", {}).get("user", {}).get("email"): + created_by = document["createdBy"]["user"]["email"] + metadata_pairs.append({"key": "Created By", "value": created_by}) + + # Modified by information + modified_by = None + if document.get("lastModifiedBy", {}).get("user", {}).get("displayName"): + modified_by = document["lastModifiedBy"]["user"]["displayName"] + elif document.get("lastModifiedBy", {}).get("user", {}).get("email"): + modified_by = document["lastModifiedBy"]["user"]["email"] + metadata_pairs.append({"key": "Modified By", "value": modified_by}) + + return metadata_pairs + + def _enrich_document_with_metadata(self, document, site=None, site_drive=None, site_list=None): + """ + Enrich document with metadata array. + + Args: + document: The document to enrich + site: Site context + site_drive: Drive context + site_list: List context + + Returns: + dict: Document enriched with metadata + """ + # Check if metadata enrichment is enabled + if not self.configuration.get("enrich_metadata", True): + return document + + try: + metadata_array = self._build_metadata_array(document, site, site_drive, site_list) + document["metadata"] = metadata_array + + self._logger.debug(f"Enriched document {document.get('_id')} with {len(metadata_array)} metadata pairs") + + except Exception as e: + self._logger.warning(f"Failed to enrich document {document.get('_id')} with metadata: {str(e)}") + # Ensure at least an empty metadata array + document["metadata"] = [] + + return document + async def _site_access_control(self, site): """Fetches all permissions for all owners, members and visitors of a given site. All groups and/or persons, which have permissions for a given site are returned with their given identity prefix ("user", "group" or "email"). @@ -1751,6 +1989,8 @@ async def get_docs(self, filtering=None): max_drive_item_age = advanced_rules["skipExtractingDriveItemsOlderThan"] async for site_collection in self.site_collections(): + # Enrich site collection with metadata + site_collection = self._enrich_document_with_metadata(site_collection) yield site_collection, None async for site in self.sites( @@ -1762,16 +2002,26 @@ async def get_docs(self, filtering=None): site_admin_access_control, ) = await self._site_access_control(site) + # Enrich site with metadata and access control + enriched_site = self._enrich_document_with_metadata(site) + enriched_site = self._decorate_with_access_control(enriched_site, site_access_control) + yield ( - self._decorate_with_access_control(site, site_access_control), + enriched_site, None, ) async for site_drive in self.site_drives(site): + # Enrich site drive with metadata and access control + enriched_site_drive = self._enrich_document_with_metadata( + site_drive, site=site, site_drive=site_drive + ) + enriched_site_drive = self._decorate_with_access_control( + enriched_site_drive, site_access_control + ) + yield ( - self._decorate_with_access_control( - site_drive, site_access_control - ), + enriched_site_drive, None, ) @@ -1790,6 +2040,11 @@ async def get_docs(self, filtering=None): "lastModifiedDateTime" ) + # Enrich with metadata + drive_item = self._enrich_document_with_metadata( + drive_item, site=site, site_drive=site_drive + ) + # Drive items should inherit site access controls only if # 'fetch_drive_item_permissions' is disabled in the config if not self.configuration[ @@ -1853,6 +2108,8 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): max_drive_item_age = advanced_rules["skipExtractingDriveItemsOlderThan"] async for site_collection in self.site_collections(): + # Enrich site collection with metadata + site_collection = self._enrich_document_with_metadata(site_collection) yield site_collection, None, OP_INDEX async for site in self.sites( @@ -1865,8 +2122,12 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): site_admin_access_control, ) = await self._site_access_control(site) + # Enrich site with metadata and access control + enriched_site = self._enrich_document_with_metadata(site) + enriched_site = self._decorate_with_access_control(enriched_site, site_access_control) + yield ( - self._decorate_with_access_control(site, site_access_control), + enriched_site, None, OP_INDEX, ) @@ -1875,10 +2136,16 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): # lastModifiedDateTime of the parent site_drive. Therefore, we # set check_timestamp to False when iterating over site_drives. async for site_drive in self.site_drives(site, check_timestamp=False): + # Enrich site drive with metadata and access control + enriched_site_drive = self._enrich_document_with_metadata( + site_drive, site=site, site_drive=site_drive + ) + enriched_site_drive = self._decorate_with_access_control( + enriched_site_drive, site_access_control + ) + yield ( - self._decorate_with_access_control( - site_drive, site_access_control - ), + enriched_site_drive, None, OP_INDEX, ) @@ -1902,6 +2169,11 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): "lastModifiedDateTime" ) + # Enrich with metadata + drive_item = self._enrich_document_with_metadata( + drive_item, site=site, site_drive=site_drive + ) + # Drive items should inherit site access controls only if # 'fetch_drive_item_permissions' is disabled in the config if not self.configuration[ @@ -2099,13 +2371,18 @@ def _get_login_name(permissions, label): return self._decorate_with_access_control(drive_item, access_control) - async def drive_items(self, site_drive, max_drive_item_age): + async def drive_items(self, site_drive, max_drive_item_age, site=None): async for page in self.client.drive_items(site_drive["id"]): for drive_item in page: drive_item["_id"] = drive_item["id"] drive_item["object_type"] = "drive_item" drive_item["_timestamp"] = drive_item["lastModifiedDateTime"] + # Enrich with metadata + drive_item = self._enrich_document_with_metadata( + drive_item, site=site, site_drive=site_drive + ) + yield drive_item, self.download_function(drive_item, max_drive_item_age) async def site_list_items( @@ -2215,11 +2492,21 @@ async def site_list_items( ACCESS_CONTROL, [] ) + # Enrich attachment with metadata before yielding + list_item_attachment = self._enrich_document_with_metadata( + list_item_attachment, site=site, site_list={"id": site_list_id, "name": site_list_name} + ) + attachment_download_func = partial( self.get_attachment_content, list_item_attachment ) yield list_item_attachment, attachment_download_func + # Enrich list item with metadata before yielding + list_item = self._enrich_document_with_metadata( + list_item, site=site, site_list={"id": site_list_id, "name": site_list_name} + ) + yield list_item, None async def site_lists(self, site, site_access_control, check_timestamp=False): @@ -2272,6 +2559,11 @@ async def site_lists(self, site, site_access_control, check_timestamp=False): site_list, site_access_control ) + # Enrich site list with metadata before yielding + site_list = self._enrich_document_with_metadata( + site_list, site=site, site_list=site_list + ) + yield site_list async def _get_access_control_from_role_assignment(self, role_assignment): @@ -2407,6 +2699,11 @@ async def site_pages(self, site, site_access_control, check_timestamp=False): if html_field in site_page: site_page[html_field] = html_to_text(site_page[html_field]) + # Enrich site page with metadata before yielding + site_page = self._enrich_document_with_metadata( + site_page, site=site + ) + yield site_page def init_sync_cursor(self): From 3946e62aed073206e23067e6c79c585222d112c0 Mon Sep 17 00:00:00 2001 From: claudiu Date: Thu, 17 Jul 2025 10:17:01 +0300 Subject: [PATCH 6/9] Added ODC_MANAGED_PROPERTIES that append the DRIVE_ITEMS_FIELDS for the ODC category --- connectors/sources/sharepoint_online.py | 100 +++++++++++++++--------- 1 file changed, 63 insertions(+), 37 deletions(-) diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py index 413ba846a..5e608e990 100644 --- a/connectors/sources/sharepoint_online.py +++ b/connectors/sources/sharepoint_online.py @@ -74,8 +74,12 @@ FILE_WRITE_CHUNK_SIZE = 1024 * 64 # 64KB default SSD page size MAX_DOCUMENT_SIZE = 10485760 WILDCARD = "*" +# Base fields for all drive items DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference" +# Additional ODC-specific managed properties +ODC_MANAGED_PROPERTIES = "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes" + CURSOR_SITE_DRIVE_KEY = "site_drives" # Microsoft Graph API Delta constants @@ -885,10 +889,17 @@ async def drive_items_delta(self, url): if "value" in response and len(response["value"]) > 0: yield DriveItemsPage(response["value"], delta_link) - async def drive_items(self, drive_id, url=None): + async def drive_items(self, drive_id, url=None, site=None): + # Build field list with conditional ODC properties + fields = DRIVE_ITEMS_FIELDS + + # Add ODC managed properties if this is an ODC site + if site and self._is_odc_site(site): + fields = f"{DRIVE_ITEMS_FIELDS},{ODC_MANAGED_PROPERTIES}" + url = ( ( - f"{GRAPH_API_URL}/drives/{drive_id}/root/delta?$select={DRIVE_ITEMS_FIELDS}" + f"{GRAPH_API_URL}/drives/{drive_id}/root/delta?$select={fields}" ) if not url else url @@ -897,6 +908,19 @@ async def drive_items(self, drive_id, url=None): async for page in self.drive_items_delta(url): yield page + def _is_odc_site(self, site): + """Check if site is an ODC (Office Development Center) site based on URL or name patterns.""" + if not site: + return False + + web_url = site.get("webUrl", "").lower() + site_name = site.get("name", "").lower() + + # Check for ODC indicators in URL or site name + odc_indicators = ["odc", "office-development", "dev-center", "development-center"] + + return any(indicator in web_url or indicator in site_name for indicator in odc_indicators) + async def drive_items_permissions_batch(self, drive_id, drive_item_ids): requests = [] @@ -1501,6 +1525,39 @@ def _extract_metadata_from_sharepoint_fields(self, document, site=None, site_dri site_url = site["webUrl"].lower() if "odc" in site_url: metadata["Category"] = "ODC" + + # Business Unit / Division + metadata["Division"] = fields.get("BusinessUnit") + metadata["Department"] = fields.get("BusinessUnit") # Using same field as Department + + # Document Type + metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document) + + # Activity and Project information + metadata["ActivityID"] = fields.get("ActivityID") + metadata["ActivityName"] = fields.get("ActivityName") + metadata["ProjectID"] = fields.get("ProjectID") + metadata["ProjectType"] = fields.get("ProjectType") + + # Geographic and temporal metadata + metadata["Region"] = fields.get("Region") + metadata["FocusCountry"] = fields.get("FocusCountry") + metadata["Year"] = fields.get("Year") + metadata["Phase"] = fields.get("Phase") + + # Status and classification + metadata["Status"] = fields.get("OPDStatus") + metadata["GrantType"] = fields.get("GrantType") + metadata["GrantWindow"] = fields.get("GrantWindow") + + # Boolean flags + metadata["Disclosable"] = fields.get("Disclosable") + metadata["NonIFAD"] = fields.get("NonIfad") + metadata["PLF"] = fields.get("PLF") + + # System information + metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource") + elif "xdesk" in site_url: metadata["Category"] = "Xdesk" else: @@ -1508,38 +1565,7 @@ def _extract_metadata_from_sharepoint_fields(self, document, site=None, site_dri else: metadata["Category"] = None - # Business Unit / Division - metadata["Division"] = fields.get("BusinessUnit") - metadata["Department"] = fields.get("BusinessUnit") # Using same field as Department - - # Document Type - metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document) - - # Activity and Project information - metadata["ActivityID"] = fields.get("ActivityID") - metadata["ActivityName"] = fields.get("ActivityName") - metadata["ProjectID"] = fields.get("ProjectID") - metadata["ProjectType"] = fields.get("ProjectType") - - # Geographic and temporal metadata - metadata["Region"] = fields.get("Region") - metadata["FocusCountry"] = fields.get("FocusCountry") - metadata["Year"] = fields.get("Year") - metadata["Phase"] = fields.get("Phase") - - # Status and classification - metadata["Status"] = fields.get("OPDStatus") - metadata["GrantType"] = fields.get("GrantType") - metadata["GrantWindow"] = fields.get("GrantWindow") - - # Boolean flags - metadata["Disclosable"] = fields.get("Disclosable") - metadata["NonIFAD"] = fields.get("NonIfad") - metadata["PLF"] = fields.get("PLF") - - # System information - metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource") - + return metadata def _determine_content_type(self, document): @@ -2025,7 +2051,7 @@ async def get_docs(self, filtering=None): None, ) - async for page in self.client.drive_items(site_drive["id"]): + async for page in self.client.drive_items(site_drive["id"], site=site): for drive_items_batch in iterable_batches_generator( page.items, SPO_API_MAX_BATCH_SIZE ): @@ -2153,7 +2179,7 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): delta_link = self.get_drive_delta_link(site_drive["id"]) async for page in self.client.drive_items( - drive_id=site_drive["id"], url=delta_link + drive_id=site_drive["id"], url=delta_link, site=site ): for drive_items_batch in iterable_batches_generator( page.items, SPO_API_MAX_BATCH_SIZE @@ -2372,7 +2398,7 @@ def _get_login_name(permissions, label): return self._decorate_with_access_control(drive_item, access_control) async def drive_items(self, site_drive, max_drive_item_age, site=None): - async for page in self.client.drive_items(site_drive["id"]): + async for page in self.client.drive_items(site_drive["id"], site=site): for drive_item in page: drive_item["_id"] = drive_item["id"] drive_item["object_type"] = "drive_item" From e60dbf2ac8b02526ca31ff9853d84a322c37f4f1 Mon Sep 17 00:00:00 2001 From: claudiu Date: Tue, 29 Jul 2025 16:49:29 +0300 Subject: [PATCH 7/9] Moved the enrich metadata code onto a different file --- .../sources/sharepoint_metadata_enricher.py | 339 ++++++++++++++++++ connectors/sources/sharepoint_online.py | 237 +----------- 2 files changed, 358 insertions(+), 218 deletions(-) create mode 100644 connectors/sources/sharepoint_metadata_enricher.py diff --git a/connectors/sources/sharepoint_metadata_enricher.py b/connectors/sources/sharepoint_metadata_enricher.py new file mode 100644 index 000000000..19c2190cd --- /dev/null +++ b/connectors/sources/sharepoint_metadata_enricher.py @@ -0,0 +1,339 @@ +import os +from typing import Dict, List, Optional, Any + + +class SharePointMetadataEnricher: + + def __init__(self, logger=None): + self.logger = logger + + def _log_debug(self, message: str): + if self.logger: + self.logger.debug(message) + + def _log_info(self, message: str): + if self.logger: + self.logger.info(message) + + def _log_warning(self, message: str): + if self.logger: + self.logger.warning(message) + + def _extract_metadata_from_sharepoint_fields( + self, + document: Dict[str, Any], + site: Optional[Dict[str, Any]] = None, + site_drive: Optional[Dict[str, Any]] = None, + site_list: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + self._log_info(f"Starting metadata extraction for document {document.get('_id', 'unknown')}") + + metadata = {} + fields = document.get("fields", {}) + + self._log_debug(f"SharePoint fields for document {document.get('_id', 'unknown')}: {fields}") + self._log_info(f"Found {len(fields)} SharePoint fields for document {document.get('_id', 'unknown')}") + + if site and site.get("webUrl"): + site_url = site["webUrl"].lower() + if "odc" in site_url: + metadata["Category"] = "ODC" + self._log_info(f"Detected ODC category from site URL: {site_url}") + elif "xdesk" in site_url: + metadata["Category"] = "Xdesk" + self._log_info(f"Detected Xdesk category from site URL: {site_url}") + else: + metadata["Category"] = None + self._log_info(f"No specific category detected from site URL: {site_url}") + else: + metadata["Category"] = None + self._log_info("No site URL available for category detection") + + metadata["Division"] = fields.get("BusinessUnit") + metadata["Department"] = fields.get("BusinessUnit") + # Document Type + metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document) + + # Activity and Project information + metadata["ActivityID"] = fields.get("ActivityID") + metadata["ActivityName"] = fields.get("ActivityName") + metadata["ProjectID"] = fields.get("ProjectID") + metadata["ProjectType"] = fields.get("ProjectType") + + # Geographic and temporal metadata + metadata["Region"] = fields.get("Region") + metadata["FocusCountry"] = fields.get("FocusCountry") + metadata["Year"] = fields.get("Year") + metadata["Phase"] = fields.get("Phase") + + # Status and classification + metadata["Status"] = fields.get("OPDStatus") or fields.get("Status") + metadata["GrantType"] = fields.get("GrantType") + metadata["GrantWindow"] = fields.get("GrantWindow") + + # Boolean flags + metadata["Disclosable"] = fields.get("Disclosable") + metadata["NonIFAD"] = fields.get("NonIfad") + metadata["PLF"] = fields.get("PLF") + + # System information + metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource") + + # Additional common SharePoint fields that might be present + metadata["Title"] = fields.get("Title") + metadata["Author"] = fields.get("Author") + metadata["Editor"] = fields.get("Editor") + metadata["Created"] = fields.get("Created") + metadata["Modified"] = fields.get("Modified") + metadata["FileLeafRef"] = fields.get("FileLeafRef") + metadata["FileDirRef"] = fields.get("FileDirRef") + metadata["ContentType"] = fields.get("ContentType") + metadata["FileType"] = fields.get("File_x0020_Type") + + # Check for any OPD/ODC category fields + odc_category = fields.get("OPDCategory") or fields.get("ODCCategory") + if odc_category: + metadata["Category"] = odc_category + self._log_info(f"Override category with ODC field value: {odc_category}") + + self._log_info(f"Completed metadata extraction with {len(metadata)} fields for document {document.get('_id', 'unknown')}") + return metadata + + def _determine_content_type(self, document: Dict[str, Any]) -> str: + object_type = document.get("object_type", "") + + if object_type == "drive_item": + name = document.get("name", "") + if "folder" in document: + return "Folder" + elif name: + ext = os.path.splitext(name)[-1].lower() + if ext in ['.ppt', '.pptx']: + return "Presentation" + elif ext in ['.doc', '.docx', '.pdf']: + return "Document" + elif ext in ['.xls', '.xlsx']: + return "Spreadsheet" + elif ext in ['.mp4', '.avi', '.mov']: + return "Video" + elif ext in ['.jpg', '.jpeg', '.png', '.gif']: + return "Image" + else: + return "Document" + else: + return "Document" + elif object_type == "site_page": + return "Web Page" + elif object_type == "list_item": + return "List Item" + elif object_type == "list_item_attachment": + return "Attachment" + else: + return "Document" + + def _site_path_from_web_url(self, web_url: str) -> str: + url_parts = web_url.split("/sites/") + site_path_parts = url_parts[1:] + return "/sites/".join(site_path_parts) + + def build_metadata_array( + self, + document: Dict[str, Any], + site: Optional[Dict[str, Any]] = None, + site_drive: Optional[Dict[str, Any]] = None, + site_list: Optional[Dict[str, Any]] = None + ) -> List[Dict[str, Any]]: + # Build metadata array as key-value pairs for the document + self._log_info(f"Starting to build metadata array for document {document.get('_id', 'unknown')}") + metadata_pairs = [] + + try: + # Extract SharePoint-specific metadata + sharepoint_metadata = self._extract_metadata_from_sharepoint_fields( + document, site, site_drive, site_list + ) + + self._log_info(f"Building standard metadata pairs for document {document.get('_id', 'unknown')}") + + # Standard metadata that should always be present + + # Category (required field) + metadata_pairs.append({ + "key": "Category", + "value": sharepoint_metadata.get("Category") + }) + + # Site Name + site_name = None + if site: + site_name = site.get("displayName") or site.get("name") or site.get("title") + self._log_info(f"Found site name: {site_name}") + metadata_pairs.append({"key": "Site Name", "value": site_name}) + + # Document Library / Drive Name + library_name = None + if site_drive: + library_name = site_drive.get("name") or site_drive.get("displayName") + self._log_info(f"Found drive library: {library_name}") + elif site_list: + library_name = site_list.get("name") or site_list.get("displayName") + self._log_info(f"Found list library: {library_name}") + metadata_pairs.append({"key": "Document Library", "value": library_name}) + + # Division and Department (required fields) + metadata_pairs.append({ + "key": "Division", + "value": sharepoint_metadata.get("Division") + }) + metadata_pairs.append({ + "key": "Department", + "value": sharepoint_metadata.get("Department") + }) + + # Content-Type (required field) + metadata_pairs.append({ + "key": "Content-Type", + "value": sharepoint_metadata.get("Content-Type") + }) + + # File Type/Extension + file_extension = None + file_name = document.get("name") or document.get("_original_filename") or document.get("FileName", "") + if file_name and "." in file_name: + file_extension = os.path.splitext(file_name)[-1].lower() + self._log_info(f"Detected file extension: {file_extension} for file: {file_name}") + metadata_pairs.append({"key": "File Type", "value": file_extension}) + + # File Path/Location + file_path = None + if document.get("webUrl"): + file_path = document["webUrl"] + self._log_info(f"Using document webUrl as file path: {file_path}") + elif document.get("parentReference", {}).get("path"): + file_path = document["parentReference"]["path"] + self._log_info(f"Using parentReference path as file path: {file_path}") + elif site and site.get("webUrl"): + # Construct path from site URL and document name + site_path = self._site_path_from_web_url(site["webUrl"]) + if file_name: + file_path = f"{site_path}/{file_name}" + else: + file_path = site_path + self._log_info(f"Constructed file path from site URL: {file_path}") + metadata_pairs.append({"key": "File Path", "value": file_path}) + + # Add all SharePoint metadata fields for all documents + self._log_info(f"Adding SharePoint-specific fields for document {document.get('_id', 'unknown')}") + sharepoint_fields = [ + "ActivityID", "ActivityName", "ProjectID", "ProjectType", + "Region", "FocusCountry", "Year", "Phase", "Status", + "GrantType", "GrantWindow", "Disclosable", "NonIFAD", + "PLF", "SystemSource", "Title", "Author", "Editor", + "Created", "Modified", "FileLeafRef", "FileDirRef", + "ContentType", "FileType" + ] + + added_fields_count = 0 + for field in sharepoint_fields: + if field in sharepoint_metadata and sharepoint_metadata[field] is not None: + metadata_pairs.append({ + "key": field, + "value": sharepoint_metadata[field] + }) + added_fields_count += 1 + + self._log_info(f"Added {added_fields_count} SharePoint-specific fields to metadata") + + # Additional technical metadata + self._log_info(f"Adding technical metadata for document {document.get('_id', 'unknown')}") + metadata_pairs.append({"key": "Object Type", "value": document.get("object_type")}) + metadata_pairs.append({"key": "Document ID", "value": document.get("_id")}) + metadata_pairs.append({ + "key": "Last Modified", + "value": document.get("_timestamp") or document.get("lastModifiedDateTime") + }) + + # Size information for files + if document.get("size"): + self._log_info(f"Found file size: {document.get('size')} bytes") + metadata_pairs.append({"key": "File Size", "value": document.get("size")}) + + # Creator information + created_by = None + if document.get("createdBy", {}).get("user", {}).get("displayName"): + created_by = document["createdBy"]["user"]["displayName"] + self._log_info(f"Found creator display name: {created_by}") + elif document.get("createdBy", {}).get("user", {}).get("email"): + created_by = document["createdBy"]["user"]["email"] + self._log_info(f"Found creator email: {created_by}") + metadata_pairs.append({"key": "Created By", "value": created_by}) + + # Modified by information + modified_by = None + if document.get("lastModifiedBy", {}).get("user", {}).get("displayName"): + modified_by = document["lastModifiedBy"]["user"]["displayName"] + self._log_info(f"Found last modifier display name: {modified_by}") + elif document.get("lastModifiedBy", {}).get("user", {}).get("email"): + modified_by = document["lastModifiedBy"]["user"]["email"] + self._log_info(f"Found last modifier email: {modified_by}") + metadata_pairs.append({"key": "Modified By", "value": modified_by}) + + self._log_info(f"Built {len(metadata_pairs)} metadata pairs for document {document.get('_id')}") + + except Exception as e: + self._log_warning(f"Error building metadata array for document {document.get('_id')}: {str(e)}") + # Return minimal metadata on error + self._log_info("Returning minimal metadata due to error") + metadata_pairs = [ + {"key": "Category", "value": None}, + {"key": "Site Name", "value": None}, + {"key": "Document Library", "value": None}, + {"key": "Division", "value": None}, + {"key": "Department", "value": None}, + {"key": "Content-Type", "value": None}, + {"key": "File Type", "value": None}, + {"key": "File Path", "value": None} + ] + + return metadata_pairs + + def enrich_document_with_metadata( + self, + document: Dict[str, Any], + site: Optional[Dict[str, Any]] = None, + site_drive: Optional[Dict[str, Any]] = None, + site_list: Optional[Dict[str, Any]] = None, + enrich_metadata_enabled: bool = True + ) -> Dict[str, Any]: + + if not enrich_metadata_enabled: + self._log_info(f"Metadata enrichment disabled for document {document.get('_id', 'unknown')}") + return document + + # Create a copy of the document to avoid modifying the original + self._log_info(f"Starting metadata enrichment for document {document.get('_id', 'unknown')}") + enriched_document = document.copy() + + try: + metadata_array = self.build_metadata_array(enriched_document, site, site_drive, site_list) + enriched_document["metadata"] = metadata_array + + self._log_info(f"Successfully enriched document {enriched_document.get('_id')} with {len(metadata_array)} metadata pairs") + + except Exception as e: + self._log_warning(f"Failed to enrich document {enriched_document.get('_id')} with metadata: {str(e)}") + # Ensure at least an empty metadata array with required fields + self._log_info("Setting fallback metadata array with required fields") + enriched_document["metadata"] = [ + {"key": "Category", "value": None}, + {"key": "Site Name", "value": None}, + {"key": "Document Library", "value": None}, + {"key": "Division", "value": None}, + {"key": "Department", "value": None}, + {"key": "Content-Type", "value": None}, + {"key": "File Type", "value": None}, + {"key": "File Path", "value": None} + ] + + self._log_info(f"Completed metadata enrichment for document {enriched_document.get('_id', 'unknown')}") + return enriched_document diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py index 5e608e990..8a612e1a5 100644 --- a/connectors/sources/sharepoint_online.py +++ b/connectors/sources/sharepoint_online.py @@ -33,6 +33,7 @@ ) from connectors.logger import logger from connectors.source import CURSOR_SYNC_TIMESTAMP, BaseDataSource +from connectors.sources.sharepoint_metadata_enricher import SharePointMetadataEnricher from connectors.utils import ( TIKA_SUPPORTED_FILETYPES, CacheWithTimeout, @@ -1260,9 +1261,18 @@ def __init__(self, configuration): self._client = None self.site_group_cache = {} + self._metadata_enricher = None def _set_internal_logger(self): self.client.set_logger(self._logger) + # Initialize metadata enricher with logger + self._metadata_enricher = SharePointMetadataEnricher(logger=self._logger) + + @property + def metadata_enricher(self): + if not self._metadata_enricher: + self._metadata_enricher = SharePointMetadataEnricher(logger=self._logger) + return self._metadata_enricher @property def client(self): @@ -1495,210 +1505,9 @@ def _decorate_with_access_control(self, document, access_control): return document - def _extract_metadata_from_sharepoint_fields(self, document, site=None, site_drive=None, site_list=None): - """ - Extract metadata from SharePoint fields and map them to standardized metadata structure. - - Args: - document: The SharePoint document/item - site: Site information (optional) - site_drive: Drive information (optional) - site_list: List information (optional) - - Returns: - dict: Extracted metadata as key-value pairs - """ - metadata = {} - - - fields = document.get("fields", {}) - - # Map SharePoint managed properties to your metadata structure - # Based on your SEARCH Managed properties table - # Category mapping - odc_category = fields.get("OPDCategory") or fields.get("ODCCategory") - if odc_category: - metadata["Category"] = odc_category - else: - # Try to determine category from site URL or other indicators - if site and site.get("webUrl"): - site_url = site["webUrl"].lower() - if "odc" in site_url: - metadata["Category"] = "ODC" - - # Business Unit / Division - metadata["Division"] = fields.get("BusinessUnit") - metadata["Department"] = fields.get("BusinessUnit") # Using same field as Department - - # Document Type - metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document) - - # Activity and Project information - metadata["ActivityID"] = fields.get("ActivityID") - metadata["ActivityName"] = fields.get("ActivityName") - metadata["ProjectID"] = fields.get("ProjectID") - metadata["ProjectType"] = fields.get("ProjectType") - - # Geographic and temporal metadata - metadata["Region"] = fields.get("Region") - metadata["FocusCountry"] = fields.get("FocusCountry") - metadata["Year"] = fields.get("Year") - metadata["Phase"] = fields.get("Phase") - - # Status and classification - metadata["Status"] = fields.get("OPDStatus") - metadata["GrantType"] = fields.get("GrantType") - metadata["GrantWindow"] = fields.get("GrantWindow") - - # Boolean flags - metadata["Disclosable"] = fields.get("Disclosable") - metadata["NonIFAD"] = fields.get("NonIfad") - metadata["PLF"] = fields.get("PLF") - - # System information - metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource") - - elif "xdesk" in site_url: - metadata["Category"] = "Xdesk" - else: - metadata["Category"] = None - else: - metadata["Category"] = None - - - return metadata - - def _determine_content_type(self, document): - """ - Determine content type based on document properties. - - Args: - document: The SharePoint document - - Returns: - str: Content type classification - """ - object_type = document.get("object_type", "") - - if object_type == "drive_item": - name = document.get("name", "") - if "folder" in document: - return "Folder" - elif name: - ext = os.path.splitext(name)[-1].lower() - if ext in ['.ppt', '.pptx']: - return "Presentation" - elif ext in ['.doc', '.docx', '.pdf']: - return "Document" - elif ext in ['.xls', '.xlsx']: - return "Spreadsheet" - elif ext in ['.mp4', '.avi', '.mov']: - return "Video" - elif ext in ['.jpg', '.jpeg', '.png', '.gif']: - return "Image" - else: - return "Document" - elif object_type == "site_page": - return "Web Page" - elif object_type == "list_item": - return "List Item" - elif object_type == "list_item_attachment": - return "Attachment" - else: - return "Document" - - def _build_metadata_array(self, document, site=None, site_drive=None, site_list=None): - """ - Build metadata array as key-value pairs for the document. - - Args: - document: The SharePoint document - site: Site context - site_drive: Drive context - site_list: List context - - Returns: - list: Array of metadata key-value pairs - """ - metadata_pairs = [] - - # Extract SharePoint-specific metadata - sharepoint_metadata = self._extract_metadata_from_sharepoint_fields( - document, site, site_drive, site_list - ) - - # Standard metadata that should always be present - - # Site Name - site_name = None - if site: - site_name = site.get("displayName") or site.get("name") or site.get("title") - metadata_pairs.append({"key": "Site Name", "value": site_name}) - - # Document Library / Drive Name - library_name = None - if site_drive: - library_name = site_drive.get("name") or site_drive.get("displayName") - elif site_list: - library_name = site_list.get("name") or site_list.get("displayName") - metadata_pairs.append({"key": "Document Library", "value": library_name}) - - # File Type/Extension - file_extension = None - file_name = document.get("name") or document.get("_original_filename") or document.get("FileName", "") - if file_name and "." in file_name: - file_extension = os.path.splitext(file_name)[-1].lower() - metadata_pairs.append({"key": "File Type", "value": file_extension}) - - # File Path/Location - file_path = None - if document.get("webUrl"): - file_path = document["webUrl"] - elif document.get("parentReference", {}).get("path"): - file_path = document["parentReference"]["path"] - elif site and site.get("webUrl"): - # Construct path from site URL and document name - site_path = self._site_path_from_web_url(site["webUrl"]) - if file_name: - file_path = f"{site_path}/{file_name}" - else: - file_path = site_path - metadata_pairs.append({"key": "File Path", "value": file_path}) - - # Add all SharePoint metadata fields - for key, value in sharepoint_metadata.items(): - metadata_pairs.append({"key": key, "value": value}) - - # Additional technical metadata - metadata_pairs.append({"key": "Object Type", "value": document.get("object_type")}) - metadata_pairs.append({"key": "Document ID", "value": document.get("_id")}) - metadata_pairs.append({"key": "Last Modified", "value": document.get("_timestamp") or document.get("lastModifiedDateTime")}) - - # Size information for files - if document.get("size"): - metadata_pairs.append({"key": "File Size", "value": document.get("size")}) - - # Creator information - created_by = None - if document.get("createdBy", {}).get("user", {}).get("displayName"): - created_by = document["createdBy"]["user"]["displayName"] - elif document.get("createdBy", {}).get("user", {}).get("email"): - created_by = document["createdBy"]["user"]["email"] - metadata_pairs.append({"key": "Created By", "value": created_by}) - - # Modified by information - modified_by = None - if document.get("lastModifiedBy", {}).get("user", {}).get("displayName"): - modified_by = document["lastModifiedBy"]["user"]["displayName"] - elif document.get("lastModifiedBy", {}).get("user", {}).get("email"): - modified_by = document["lastModifiedBy"]["user"]["email"] - metadata_pairs.append({"key": "Modified By", "value": modified_by}) - - return metadata_pairs - def _enrich_document_with_metadata(self, document, site=None, site_drive=None, site_list=None): """ - Enrich document with metadata array. + Enrich document with metadata using the dedicated metadata enricher. Args: document: The document to enrich @@ -1709,22 +1518,14 @@ def _enrich_document_with_metadata(self, document, site=None, site_drive=None, s Returns: dict: Document enriched with metadata """ - # Check if metadata enrichment is enabled - if not self.configuration.get("enrich_metadata", True): - return document - - try: - metadata_array = self._build_metadata_array(document, site, site_drive, site_list) - document["metadata"] = metadata_array - - self._logger.debug(f"Enriched document {document.get('_id')} with {len(metadata_array)} metadata pairs") - - except Exception as e: - self._logger.warning(f"Failed to enrich document {document.get('_id')} with metadata: {str(e)}") - # Ensure at least an empty metadata array - document["metadata"] = [] - - return document + enrich_enabled = bool(self.configuration.get("enrich_metadata", True)) + return self.metadata_enricher.enrich_document_with_metadata( + document=document, + site=site, + site_drive=site_drive, + site_list=site_list, + enrich_metadata_enabled=enrich_enabled + ) async def _site_access_control(self, site): """Fetches all permissions for all owners, members and visitors of a given site. From b8c3e15dc643a6eea7ab3ebd54fa135cb5fc341a Mon Sep 17 00:00:00 2001 From: claudiu Date: Thu, 31 Jul 2025 12:38:22 +0300 Subject: [PATCH 8/9] Updated metadata enricher with new data --- .../sources/sharepoint_metadata_enricher.py | 200 +++++++++++++----- connectors/sources/sharepoint_online.py | 47 ++-- 2 files changed, 184 insertions(+), 63 deletions(-) diff --git a/connectors/sources/sharepoint_metadata_enricher.py b/connectors/sources/sharepoint_metadata_enricher.py index 19c2190cd..b8ea9eea5 100644 --- a/connectors/sources/sharepoint_metadata_enricher.py +++ b/connectors/sources/sharepoint_metadata_enricher.py @@ -1,11 +1,18 @@ import os from typing import Dict, List, Optional, Any +# ODC-specific managed properties for SharePoint Graph API calls +ODC_MANAGED_PROPERTIES = "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes" + +# Graph API URL constant +GRAPH_API_URL = "https://graph.microsoft.com/v1.0" + class SharePointMetadataEnricher: - def __init__(self, logger=None): + def __init__(self, logger=None, graph_api_client=None): self.logger = logger + self._graph_api_client = graph_api_client def _log_debug(self, message: str): if self.logger: @@ -19,6 +26,129 @@ def _log_warning(self, message: str): if self.logger: self.logger.warning(message) + def _is_odc_site(self, site): + """Check if site is an ODC site based on the official ODC site URLs.""" + if not site: + return False + + web_url = site.get("webUrl", "").lower() + + # Official ODC site URLs + odc_sites = [ + "aprop/", + "lacop", + "esaop", + "nenop", + "wcaop", + "epop" + ] + + # Check if the site URL matches any ODC site (with or without trailing slash) + for odc_site in odc_sites: + if f"/sites/{odc_site.rstrip('/')}" in web_url: + return True + + return False + + def get_odc_managed_properties(self): + """Get the ODC managed properties string for Graph API calls.""" + return ODC_MANAGED_PROPERTIES + + def should_include_odc_properties(self, site): + """Check if ODC properties should be included in Graph API calls for this site.""" + return self._is_odc_site(site) + + async def get_drive_list_mapping(self, site_id, site_drives_method, site_lists_method): + """ + Get mapping between drives and their corresponding SharePoint lists. + This is needed to fetch custom metadata for drive items. + """ + drive_list_mapping = {} + + try: + # Get all drives for the site + async for drive in site_drives_method(site_id): + drive_id = drive.get("id") + + # Get all lists for the site + async for site_list in site_lists_method(site_id): + list_id = site_list.get("id") + list_name = site_list.get("name") or site_list.get("displayName", "") + + # Try to match drive with list - document libraries are usually lists + # This is a heuristic approach - in reality the mapping can be complex + if ( + "document" in list_name.lower() or + "library" in list_name.lower() or + list_name.lower() in drive.get("name", "").lower() + ): + drive_list_mapping[drive_id] = list_id + self._log_info(f"Mapped drive '{drive.get('name')}' ({drive_id}) to list '{list_name}' ({list_id})") + break + + except Exception as e: + self._log_warning(f"Error creating drive-list mapping for site {site_id}: {str(e)}") + + return drive_list_mapping + + async def get_drive_item_list_fields(self, drive_id, item_id): + """ + Get custom metadata fields for a drive item via the listItem/fields endpoint. + This is the working approach that retrieves SharePoint custom metadata. + """ + if not self._graph_api_client: + self._log_warning("No Graph API client available for fetching metadata fields") + return {} + + try: + url = f"{GRAPH_API_URL}/drives/{drive_id}/items/{item_id}/listItem/fields" + response = await self._graph_api_client.fetch(url) + self._log_info(f"Retrieved {len(response)} custom fields for drive item {item_id}") + return response + + except Exception as e: + if "404" in str(e) or "NotFound" in str(e): + # Item might be deleted or not linked to a listItem + self._log_debug(f"No listItem/fields found for drive item {item_id} (404)") + else: + self._log_debug(f"Failed to get listItem/fields for item {item_id}: {str(e)}") + return {} + + async def enrich_drive_item_with_list_metadata(self, drive_item, site_id=None, drive_list_mapping=None): + """ + Enrich a drive item with custom metadata using the working listItem/fields approach. + """ + try: + # Get the drive ID and item ID + item_id = drive_item.get("id") + drive_id = None + + # Try to get drive ID from parentReference + parent_ref = drive_item.get("parentReference", {}) + if parent_ref: + drive_id = parent_ref.get("driveId") + + if not drive_id or not item_id: + self._log_debug(f"Missing drive_id or item_id for drive item {item_id}") + return drive_item + + # Use the working approach: /drives/{drive_id}/items/{item_id}/listItem/fields + custom_fields = await self.get_drive_item_list_fields(drive_id, item_id) + + if custom_fields: + enriched_item = drive_item.copy() + # Add the SharePoint list fields to the drive item + enriched_item["fields"] = custom_fields + self._log_info(f"Enriched drive item {item_id} with {len(custom_fields)} SharePoint fields from listItem/fields") + return enriched_item + else: + self._log_debug(f"No SharePoint listItem/fields found for drive item {item_id}") + + except Exception as e: + self._log_warning(f"Error enriching drive item {drive_item.get('id')} with listItem/fields metadata: {str(e)}") + + return drive_item + def _extract_metadata_from_sharepoint_fields( self, document: Dict[str, Any], @@ -26,32 +156,26 @@ def _extract_metadata_from_sharepoint_fields( site_drive: Optional[Dict[str, Any]] = None, site_list: Optional[Dict[str, Any]] = None ) -> Dict[str, Any]: - self._log_info(f"Starting metadata extraction for document {document.get('_id', 'unknown')}") + self._log_info(f"Extracting metadata for document {document.get('_id', 'unknown')}") metadata = {} fields = document.get("fields", {}) - self._log_debug(f"SharePoint fields for document {document.get('_id', 'unknown')}: {fields}") - self._log_info(f"Found {len(fields)} SharePoint fields for document {document.get('_id', 'unknown')}") - + # Determine category from site URL using ODC site detection if site and site.get("webUrl"): - site_url = site["webUrl"].lower() - if "odc" in site_url: + if self._is_odc_site(site): metadata["Category"] = "ODC" - self._log_info(f"Detected ODC category from site URL: {site_url}") - elif "xdesk" in site_url: - metadata["Category"] = "Xdesk" - self._log_info(f"Detected Xdesk category from site URL: {site_url}") else: - metadata["Category"] = None - self._log_info(f"No specific category detected from site URL: {site_url}") + site_url = site["webUrl"].lower() + if "xdesk" in site_url: + metadata["Category"] = "Xdesk" + else: + metadata["Category"] = "General" else: metadata["Category"] = None - self._log_info("No site URL available for category detection") metadata["Division"] = fields.get("BusinessUnit") metadata["Department"] = fields.get("BusinessUnit") - # Document Type metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document) # Activity and Project information @@ -59,6 +183,9 @@ def _extract_metadata_from_sharepoint_fields( metadata["ActivityName"] = fields.get("ActivityName") metadata["ProjectID"] = fields.get("ProjectID") metadata["ProjectType"] = fields.get("ProjectType") + metadata["ProjectName"] = fields.get("ProjectName") + metadata["ProjectTitle"] = fields.get("ProjectTitle") + metadata["ProjectSector"] = fields.get("ProjectSector") # Geographic and temporal metadata metadata["Region"] = fields.get("Region") @@ -79,7 +206,7 @@ def _extract_metadata_from_sharepoint_fields( # System information metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource") - # Additional common SharePoint fields that might be present + # Additional common SharePoint fields metadata["Title"] = fields.get("Title") metadata["Author"] = fields.get("Author") metadata["Editor"] = fields.get("Editor") @@ -94,9 +221,8 @@ def _extract_metadata_from_sharepoint_fields( odc_category = fields.get("OPDCategory") or fields.get("ODCCategory") if odc_category: metadata["Category"] = odc_category - self._log_info(f"Override category with ODC field value: {odc_category}") - self._log_info(f"Completed metadata extraction with {len(metadata)} fields for document {document.get('_id', 'unknown')}") + self._log_info(f"Extracted metadata with {len([v for v in metadata.values() if v is not None])} non-null fields") return metadata def _determine_content_type(self, document: Dict[str, Any]) -> str: @@ -143,8 +269,7 @@ def build_metadata_array( site_drive: Optional[Dict[str, Any]] = None, site_list: Optional[Dict[str, Any]] = None ) -> List[Dict[str, Any]]: - # Build metadata array as key-value pairs for the document - self._log_info(f"Starting to build metadata array for document {document.get('_id', 'unknown')}") + self._log_info(f"Building metadata array for document {document.get('_id', 'unknown')}") metadata_pairs = [] try: @@ -153,11 +278,7 @@ def build_metadata_array( document, site, site_drive, site_list ) - self._log_info(f"Building standard metadata pairs for document {document.get('_id', 'unknown')}") - # Standard metadata that should always be present - - # Category (required field) metadata_pairs.append({ "key": "Category", "value": sharepoint_metadata.get("Category") @@ -167,17 +288,14 @@ def build_metadata_array( site_name = None if site: site_name = site.get("displayName") or site.get("name") or site.get("title") - self._log_info(f"Found site name: {site_name}") metadata_pairs.append({"key": "Site Name", "value": site_name}) # Document Library / Drive Name library_name = None if site_drive: library_name = site_drive.get("name") or site_drive.get("displayName") - self._log_info(f"Found drive library: {library_name}") elif site_list: library_name = site_list.get("name") or site_list.get("displayName") - self._log_info(f"Found list library: {library_name}") metadata_pairs.append({"key": "Document Library", "value": library_name}) # Division and Department (required fields) @@ -201,31 +319,26 @@ def build_metadata_array( file_name = document.get("name") or document.get("_original_filename") or document.get("FileName", "") if file_name and "." in file_name: file_extension = os.path.splitext(file_name)[-1].lower() - self._log_info(f"Detected file extension: {file_extension} for file: {file_name}") metadata_pairs.append({"key": "File Type", "value": file_extension}) # File Path/Location file_path = None if document.get("webUrl"): file_path = document["webUrl"] - self._log_info(f"Using document webUrl as file path: {file_path}") elif document.get("parentReference", {}).get("path"): file_path = document["parentReference"]["path"] - self._log_info(f"Using parentReference path as file path: {file_path}") elif site and site.get("webUrl"): - # Construct path from site URL and document name site_path = self._site_path_from_web_url(site["webUrl"]) if file_name: file_path = f"{site_path}/{file_name}" else: file_path = site_path - self._log_info(f"Constructed file path from site URL: {file_path}") metadata_pairs.append({"key": "File Path", "value": file_path}) # Add all SharePoint metadata fields for all documents - self._log_info(f"Adding SharePoint-specific fields for document {document.get('_id', 'unknown')}") sharepoint_fields = [ "ActivityID", "ActivityName", "ProjectID", "ProjectType", + "ProjectName", "ProjectTitle", "ProjectSector", "Region", "FocusCountry", "Year", "Phase", "Status", "GrantType", "GrantWindow", "Disclosable", "NonIFAD", "PLF", "SystemSource", "Title", "Author", "Editor", @@ -233,19 +346,14 @@ def build_metadata_array( "ContentType", "FileType" ] - added_fields_count = 0 for field in sharepoint_fields: if field in sharepoint_metadata and sharepoint_metadata[field] is not None: metadata_pairs.append({ "key": field, "value": sharepoint_metadata[field] }) - added_fields_count += 1 - - self._log_info(f"Added {added_fields_count} SharePoint-specific fields to metadata") # Additional technical metadata - self._log_info(f"Adding technical metadata for document {document.get('_id', 'unknown')}") metadata_pairs.append({"key": "Object Type", "value": document.get("object_type")}) metadata_pairs.append({"key": "Document ID", "value": document.get("_id")}) metadata_pairs.append({ @@ -255,35 +363,29 @@ def build_metadata_array( # Size information for files if document.get("size"): - self._log_info(f"Found file size: {document.get('size')} bytes") metadata_pairs.append({"key": "File Size", "value": document.get("size")}) # Creator information created_by = None if document.get("createdBy", {}).get("user", {}).get("displayName"): created_by = document["createdBy"]["user"]["displayName"] - self._log_info(f"Found creator display name: {created_by}") elif document.get("createdBy", {}).get("user", {}).get("email"): created_by = document["createdBy"]["user"]["email"] - self._log_info(f"Found creator email: {created_by}") metadata_pairs.append({"key": "Created By", "value": created_by}) # Modified by information modified_by = None if document.get("lastModifiedBy", {}).get("user", {}).get("displayName"): modified_by = document["lastModifiedBy"]["user"]["displayName"] - self._log_info(f"Found last modifier display name: {modified_by}") elif document.get("lastModifiedBy", {}).get("user", {}).get("email"): modified_by = document["lastModifiedBy"]["user"]["email"] - self._log_info(f"Found last modifier email: {modified_by}") metadata_pairs.append({"key": "Modified By", "value": modified_by}) - self._log_info(f"Built {len(metadata_pairs)} metadata pairs for document {document.get('_id')}") + self._log_info(f"Built {len(metadata_pairs)} metadata pairs") except Exception as e: self._log_warning(f"Error building metadata array for document {document.get('_id')}: {str(e)}") # Return minimal metadata on error - self._log_info("Returning minimal metadata due to error") metadata_pairs = [ {"key": "Category", "value": None}, {"key": "Site Name", "value": None}, @@ -307,23 +409,20 @@ def enrich_document_with_metadata( ) -> Dict[str, Any]: if not enrich_metadata_enabled: - self._log_info(f"Metadata enrichment disabled for document {document.get('_id', 'unknown')}") return document - # Create a copy of the document to avoid modifying the original - self._log_info(f"Starting metadata enrichment for document {document.get('_id', 'unknown')}") + self._log_info(f"Enriching document {document.get('_id', 'unknown')} with metadata") enriched_document = document.copy() try: metadata_array = self.build_metadata_array(enriched_document, site, site_drive, site_list) enriched_document["metadata"] = metadata_array - self._log_info(f"Successfully enriched document {enriched_document.get('_id')} with {len(metadata_array)} metadata pairs") + self._log_info(f"Successfully enriched document with {len(metadata_array)} metadata pairs") except Exception as e: self._log_warning(f"Failed to enrich document {enriched_document.get('_id')} with metadata: {str(e)}") # Ensure at least an empty metadata array with required fields - self._log_info("Setting fallback metadata array with required fields") enriched_document["metadata"] = [ {"key": "Category", "value": None}, {"key": "Site Name", "value": None}, @@ -335,5 +434,4 @@ def enrich_document_with_metadata( {"key": "File Path", "value": None} ] - self._log_info(f"Completed metadata enrichment for document {enriched_document.get('_id', 'unknown')}") return enriched_document diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py index 8a612e1a5..85576f51c 100644 --- a/connectors/sources/sharepoint_online.py +++ b/connectors/sources/sharepoint_online.py @@ -78,9 +78,6 @@ # Base fields for all drive items DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference" -# Additional ODC-specific managed properties -ODC_MANAGED_PROPERTIES = "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes" - CURSOR_SITE_DRIVE_KEY = "site_drives" # Microsoft Graph API Delta constants @@ -890,13 +887,21 @@ async def drive_items_delta(self, url): if "value" in response and len(response["value"]) > 0: yield DriveItemsPage(response["value"], delta_link) - async def drive_items(self, drive_id, url=None, site=None): + async def drive_items(self, drive_id, url=None, site=None, metadata_enricher=None): # Build field list with conditional ODC properties fields = DRIVE_ITEMS_FIELDS # Add ODC managed properties if this is an ODC site - if site and self._is_odc_site(site): - fields = f"{DRIVE_ITEMS_FIELDS},{ODC_MANAGED_PROPERTIES}" + # Use metadata enricher's ODC detection if available, otherwise fall back to local method + is_odc = False + if metadata_enricher and hasattr(metadata_enricher, 'should_include_odc_properties'): + is_odc = metadata_enricher.should_include_odc_properties(site) + elif site: + is_odc = self._is_odc_site(site) + + if is_odc and metadata_enricher: + odc_properties = metadata_enricher.get_odc_managed_properties() + fields = f"{DRIVE_ITEMS_FIELDS},{odc_properties}" url = ( ( @@ -1265,13 +1270,19 @@ def __init__(self, configuration): def _set_internal_logger(self): self.client.set_logger(self._logger) - # Initialize metadata enricher with logger - self._metadata_enricher = SharePointMetadataEnricher(logger=self._logger) + # Initialize metadata enricher with logger and graph api client + self._metadata_enricher = SharePointMetadataEnricher( + logger=self._logger, + graph_api_client=self.client._graph_api_client + ) @property def metadata_enricher(self): if not self._metadata_enricher: - self._metadata_enricher = SharePointMetadataEnricher(logger=self._logger) + self._metadata_enricher = SharePointMetadataEnricher( + logger=self._logger, + graph_api_client=self.client._graph_api_client + ) return self._metadata_enricher @property @@ -1852,7 +1863,7 @@ async def get_docs(self, filtering=None): None, ) - async for page in self.client.drive_items(site_drive["id"], site=site): + async for page in self.client.drive_items(site_drive["id"], site=site, metadata_enricher=self.metadata_enricher): for drive_items_batch in iterable_batches_generator( page.items, SPO_API_MAX_BATCH_SIZE ): @@ -1867,6 +1878,12 @@ async def get_docs(self, filtering=None): "lastModifiedDateTime" ) + # HYBRID APPROACH: Enrich drive item with SharePoint list metadata using listItem/fields + if self.configuration.get("enrich_metadata", True): + drive_item = await self.metadata_enricher.enrich_drive_item_with_list_metadata( + drive_item, site["id"] + ) + # Enrich with metadata drive_item = self._enrich_document_with_metadata( drive_item, site=site, site_drive=site_drive @@ -1980,7 +1997,7 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): delta_link = self.get_drive_delta_link(site_drive["id"]) async for page in self.client.drive_items( - drive_id=site_drive["id"], url=delta_link, site=site + drive_id=site_drive["id"], url=delta_link, site=site, metadata_enricher=self.metadata_enricher ): for drive_items_batch in iterable_batches_generator( page.items, SPO_API_MAX_BATCH_SIZE @@ -1996,6 +2013,12 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None): "lastModifiedDateTime" ) + # HYBRID APPROACH: Enrich drive item with SharePoint list metadata using listItem/fields + if self.configuration.get("enrich_metadata", True): + drive_item = await self.metadata_enricher.enrich_drive_item_with_list_metadata( + drive_item, site["id"] + ) + # Enrich with metadata drive_item = self._enrich_document_with_metadata( drive_item, site=site, site_drive=site_drive @@ -2199,7 +2222,7 @@ def _get_login_name(permissions, label): return self._decorate_with_access_control(drive_item, access_control) async def drive_items(self, site_drive, max_drive_item_age, site=None): - async for page in self.client.drive_items(site_drive["id"], site=site): + async for page in self.client.drive_items(site_drive["id"], site=site, metadata_enricher=self.metadata_enricher): for drive_item in page: drive_item["_id"] = drive_item["id"] drive_item["object_type"] = "drive_item" From d54027d812232a6706f5e8ad4bba1ef5ee05e079 Mon Sep 17 00:00:00 2001 From: ygdrax Date: Mon, 4 Aug 2025 16:52:59 +0300 Subject: [PATCH 9/9] Updated metadata structure and odc condition --- .../sources/sharepoint_metadata_enricher.py | 143 +++++++++++++++--- 1 file changed, 125 insertions(+), 18 deletions(-) diff --git a/connectors/sources/sharepoint_metadata_enricher.py b/connectors/sources/sharepoint_metadata_enricher.py index b8ea9eea5..47e10445d 100644 --- a/connectors/sources/sharepoint_metadata_enricher.py +++ b/connectors/sources/sharepoint_metadata_enricher.py @@ -2,7 +2,8 @@ from typing import Dict, List, Optional, Any # ODC-specific managed properties for SharePoint Graph API calls -ODC_MANAGED_PROPERTIES = "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes" +# Includes both ODC and ODP (Operations Document Portal) fields +ODC_MANAGED_PROPERTIES = "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes,ProjectID,ProjectName,ProjectTitle,ProjectType,ProjectSector,Phase,Region,Country,FocusCountry,GrantType,GrantWindow,GrantRecipient,Theme,ShortName,AllDocuments,Disclosable,Disclosed,NonIFAD,PLF,Sensitive,BorrowerID,CopyValidation,DocumentTypeID,ODCIntegration_CIMission" # Graph API URL constant GRAPH_API_URL = "https://graph.microsoft.com/v1.0" @@ -35,7 +36,7 @@ def _is_odc_site(self, site): # Official ODC site URLs odc_sites = [ - "aprop/", + "aprop", "lacop", "esaop", "nenop", @@ -161,6 +162,28 @@ def _extract_metadata_from_sharepoint_fields( metadata = {} fields = document.get("fields", {}) + # Helper function to extract value from complex SharePoint field structures + def extract_field_value(field_data): + if isinstance(field_data, dict): + # Handle managed metadata fields with Label/TermGuid structure + if "Label" in field_data: + return field_data["Label"] + # Handle lookup fields or other object structures + elif "DisplayName" in field_data: + return field_data["DisplayName"] + elif "Value" in field_data: + return field_data["Value"] + elif isinstance(field_data, list): + # Handle arrays of managed metadata or lookup fields + if field_data and isinstance(field_data[0], dict): + return [extract_field_value(item) for item in field_data] + else: + return field_data + else: + # Handle simple string/number/boolean values + return field_data + return None + # Determine category from site URL using ODC site detection if site and site.get("webUrl"): if self._is_odc_site(site): @@ -174,37 +197,78 @@ def _extract_metadata_from_sharepoint_fields( else: metadata["Category"] = None + # Core business metadata metadata["Division"] = fields.get("BusinessUnit") metadata["Department"] = fields.get("BusinessUnit") - metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document) - # Activity and Project information - metadata["ActivityID"] = fields.get("ActivityID") - metadata["ActivityName"] = fields.get("ActivityName") - metadata["ProjectID"] = fields.get("ProjectID") + # Document type - handle both simple and complex field structures + doc_type = extract_field_value(fields.get("DocumentType")) + metadata["Content-Type"] = doc_type or self._determine_content_type(document) + + # Project information - handle complex field structures + project_id = extract_field_value(fields.get("ProjectID")) + if project_id: + metadata["ProjectID"] = project_id + else: + # Fallback to hidden field + metadata["ProjectID"] = fields.get("ProjectID_Hidden") + metadata["ProjectType"] = fields.get("ProjectType") metadata["ProjectName"] = fields.get("ProjectName") metadata["ProjectTitle"] = fields.get("ProjectTitle") metadata["ProjectSector"] = fields.get("ProjectSector") + # Handle ShortName and AllDocuments (project short names) + short_name = extract_field_value(fields.get("ShortName")) + all_documents = extract_field_value(fields.get("AllDocuments")) + metadata["ShortName"] = short_name + metadata["AllDocuments"] = all_documents + # Geographic and temporal metadata metadata["Region"] = fields.get("Region") - metadata["FocusCountry"] = fields.get("FocusCountry") + metadata["Country"] = fields.get("Country") + metadata["CountryID"] = fields.get("CountryID") + metadata["FocusCountryIDs"] = fields.get("FocusCountryIDs") + + # Handle complex FocusCountry field + focus_country = extract_field_value(fields.get("FocusCountry")) + metadata["FocusCountry"] = focus_country + metadata["Year"] = fields.get("Year") metadata["Phase"] = fields.get("Phase") + metadata["PhaseID"] = fields.get("PhaseID") - # Status and classification - metadata["Status"] = fields.get("OPDStatus") or fields.get("Status") + # Grant and financing information metadata["GrantType"] = fields.get("GrantType") metadata["GrantWindow"] = fields.get("GrantWindow") + grant_recipient = extract_field_value(fields.get("GrantRecipient")) + metadata["GrantRecipient"] = grant_recipient + metadata["BorrowerID"] = fields.get("BorrowerID") - # Boolean flags + # Themes and topics + themes = extract_field_value(fields.get("Theme")) + metadata["Theme"] = themes + + # Document classification and status metadata["Disclosable"] = fields.get("Disclosable") + metadata["Disclosed"] = fields.get("Disclosed") metadata["NonIFAD"] = fields.get("NonIfad") metadata["PLF"] = fields.get("PLF") + metadata["Sensitive"] = fields.get("Sensitive") + metadata["IsInDocSet"] = fields.get("IsInDocSet") + metadata["OPDIsLink"] = fields.get("OPDIsLink") + + # Validation and compliance + metadata["CopyValidation"] = fields.get("CopyValidation") + metadata["SentToRMS"] = fields.get("SentToRMS") - # System information + # System and integration fields + metadata["ODCIntegration_CIMission"] = fields.get("ODCIntegration_CIMission") metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource") + metadata["DocumentTypeID"] = fields.get("DocumentTypeID") + + # Project reference field + metadata["Project"] = fields.get("Project") # Additional common SharePoint fields metadata["Title"] = fields.get("Title") @@ -217,6 +281,26 @@ def _extract_metadata_from_sharepoint_fields( metadata["ContentType"] = fields.get("ContentType") metadata["FileType"] = fields.get("File_x0020_Type") + # Document icon and size + metadata["DocIcon"] = fields.get("DocIcon") + metadata["FileSizeDisplay"] = fields.get("FileSizeDisplay") + + # Document ID and linking + dlc_doc_id = fields.get("_dlc_DocIdUrl") + if isinstance(dlc_doc_id, dict) and "Description" in dlc_doc_id: + metadata["DocumentID"] = dlc_doc_id["Description"] + metadata["DocumentIDUrl"] = dlc_doc_id.get("Url") + + # Version information + metadata["UIVersionString"] = fields.get("_UIVersionString") + + # Activity information (for ODC compatibility) + metadata["ActivityID"] = fields.get("ActivityID") + metadata["ActivityName"] = fields.get("ActivityName") + + # Legacy status field handling + metadata["Status"] = fields.get("OPDStatus") or fields.get("Status") + # Check for any OPD/ODC category fields odc_category = fields.get("OPDCategory") or fields.get("ODCCategory") if odc_category: @@ -337,13 +421,36 @@ def build_metadata_array( # Add all SharePoint metadata fields for all documents sharepoint_fields = [ + # Project and Activity Information "ActivityID", "ActivityName", "ProjectID", "ProjectType", - "ProjectName", "ProjectTitle", "ProjectSector", - "Region", "FocusCountry", "Year", "Phase", "Status", - "GrantType", "GrantWindow", "Disclosable", "NonIFAD", - "PLF", "SystemSource", "Title", "Author", "Editor", - "Created", "Modified", "FileLeafRef", "FileDirRef", - "ContentType", "FileType" + "ProjectName", "ProjectTitle", "ProjectSector", "Project", + "ShortName", "AllDocuments", + + # Geographic and Temporal + "Region", "Country", "CountryID", "FocusCountry", "FocusCountryIDs", + "Year", "Phase", "PhaseID", + + # Grant and Financing + "GrantType", "GrantWindow", "GrantRecipient", "BorrowerID", + + # Themes and Classification + "Theme", "Status", "DocumentTypeID", + + # Flags and Status + "Disclosable", "Disclosed", "NonIFAD", "PLF", "Sensitive", + "IsInDocSet", "OPDIsLink", + + # Validation and Compliance + "CopyValidation", "SentToRMS", + + # System and Integration + "SystemSource", "ODCIntegration_CIMission", + + # Standard SharePoint Fields + "Title", "Author", "Editor", "Created", "Modified", + "FileLeafRef", "FileDirRef", "ContentType", "FileType", + "DocIcon", "FileSizeDisplay", "DocumentID", "DocumentIDUrl", + "UIVersionString" ] for field in sharepoint_fields: