From 1eea22cbe39e478d1a5943ce5dad26d453f137a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lle=C3=AFr=20Borr=C3=A0s=20Metje?= <lleir@llegeix.me>
Date: Tue, 10 Jun 2025 09:55:36 +0200
Subject: [PATCH 1/9] HonorNonIndexedQueriesWarningMayFailRandomly header

---
 connectors/sources/sharepoint_online.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
index 5786bcb4f..d90e2cadd 100644
--- a/connectors/sources/sharepoint_online.py
+++ b/connectors/sources/sharepoint_online.py
@@ -499,6 +499,14 @@ async def _get(self, absolute_url, retry_count=0):
         try:
             token = await self._api_token.get()
             headers = {"authorization": f"Bearer {token}"}
+
+            # If the absolute_url contains "/sites?expand=sites" then add header prefer: HonorNonIndexedQueriesWarningMayFailRandomly as per Microsoft Support to handle sites with many subsites
+            if "/sites?expand=sites" in absolute_url:
+                headers["prefer"] = "HonorNonIndexedQueriesWarningMayFailRandomly"
+                self._logger.info(
+                    f"Adding header prefer: HonorNonIndexedQueriesWarningMayFailRandomly to request {absolute_url}"
+                )
+
             self._logger.debug(f"Calling Sharepoint Endpoint: {absolute_url}")
 
             async with self._http_session.get(

From 90a0a11be11a9174d66ba47cc56fdcdb3ca5911c Mon Sep 17 00:00:00 2001
From: claudiu <c.colesnicencu@ifad.org>
Date: Tue, 8 Jul 2025 13:09:48 +0300
Subject: [PATCH 2/9] Handle Everyone Except External Users group

---
 connectors/sources/sharepoint_online.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
index d90e2cadd..89d1ee638 100644
--- a/connectors/sources/sharepoint_online.py
+++ b/connectors/sources/sharepoint_online.py
@@ -2636,9 +2636,17 @@ async def _access_control_for_member(self, member):
         a reference to a group's owners, or an individual, and will act accordingly.
         :param member: The dict representing a generic SPO entity. May be a group or an individual
         :return: the access control list (ACL) for this "member"
+
+        Detect when a member has the login name: c:0-.f|rolemanager|spo-grid-all-users. 
+        Map it to a standard identifier in _allow_access_control, for example: group:EveryoneExceptExternalUsers
         """
         login_name = member.get("LoginName")
 
+        # Handle "Everyone Except External Users" group
+        if login_name == "c:0-.f|rolemanager|spo-grid-all-users":
+            self._logger.debug(f"Detected 'Everyone Except External Users' group: '{member.get('Title')}'.")
+            return ["group:EveryoneExceptExternalUsers"]
+
         # 'LoginName' looking like a group indicates a group
         is_group = (
             login_name.startswith("c:0o.c|federateddirectoryclaimprovider|")

From cce8f2ebd29f8416fc87a7080a19e1e6c23c4481 Mon Sep 17 00:00:00 2001
From: claudiu <c.colesnicencu@ifad.org>
Date: Tue, 8 Jul 2025 13:11:36 +0300
Subject: [PATCH 3/9] Updated comment

---
 connectors/sources/sharepoint_online.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
index 89d1ee638..25f3da44e 100644
--- a/connectors/sources/sharepoint_online.py
+++ b/connectors/sources/sharepoint_online.py
@@ -2638,7 +2638,7 @@ async def _access_control_for_member(self, member):
         :return: the access control list (ACL) for this "member"
 
         Detect when a member has the login name: c:0-.f|rolemanager|spo-grid-all-users. 
-        Map it to a standard identifier in _allow_access_control, for example: group:EveryoneExceptExternalUsers
+        Map it to a standard identifier in _allow_access_control
         """
         login_name = member.get("LoginName")
 

From 9a4f14fcd4dccd1838f8c585fa45cb490daee2da Mon Sep 17 00:00:00 2001
From: claudiu <c.colesnicencu@ifad.org>
Date: Wed, 9 Jul 2025 18:25:28 +0300
Subject: [PATCH 4/9] Added test and updated the check for external users

---
 connectors/sources/sharepoint_online.py |  2 +-
 tests/sources/test_sharepoint_online.py | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
index 25f3da44e..0cadc78d8 100644
--- a/connectors/sources/sharepoint_online.py
+++ b/connectors/sources/sharepoint_online.py
@@ -2643,7 +2643,7 @@ async def _access_control_for_member(self, member):
         login_name = member.get("LoginName")
 
         # Handle "Everyone Except External Users" group
-        if login_name == "c:0-.f|rolemanager|spo-grid-all-users":
+        if login_name and login_name.startswith("c:0-.f|rolemanager|spo-grid-all-users"):
             self._logger.debug(f"Detected 'Everyone Except External Users' group: '{member.get('Title')}'.")
             return ["group:EveryoneExceptExternalUsers"]
 
diff --git a/tests/sources/test_sharepoint_online.py b/tests/sources/test_sharepoint_online.py
index dea56c28f..401b5cca3 100644
--- a/tests/sources/test_sharepoint_online.py
+++ b/tests/sources/test_sharepoint_online.py
@@ -3672,6 +3672,18 @@ def test_prefix_user_id(self):
                 },
                 [_prefix_group(GROUP_ONE_ID)],
             ),
+            (
+                # Everyone Except External Users group (access control: mapped group identifier)
+                {
+                    "Member": {
+                        "odata.type": "SP.User",
+                        "LoginName": "c:0-.f|rolemanager|spo-grid-all-users",
+                        "Title": "Everyone except external users",
+                    },
+                    "RoleDefinitionBindings": READ_BINDING,
+                },
+                ["group:EveryoneExceptExternalUsers"],
+            ),
             (
                 # Unknown type (access control: nothing)
                 {

From fdecd34ff5bf286082c0fccd774a96905edc59ee Mon Sep 17 00:00:00 2001
From: claudiu <c.colesnicencu@ifad.org>
Date: Wed, 16 Jul 2025 13:30:51 +0300
Subject: [PATCH 5/9] Added new metadata from SharePoint

---
 connectors/sources/sharepoint_online.py | 315 +++++++++++++++++++++++-
 1 file changed, 306 insertions(+), 9 deletions(-)

diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
index 0cadc78d8..413ba846a 100644
--- a/connectors/sources/sharepoint_online.py
+++ b/connectors/sources/sharepoint_online.py
@@ -1400,6 +1400,15 @@ def get_default_configuration(cls):
                 "type": "bool",
                 "value": True,
             },
+            "enrich_metadata": {
+                "display": "toggle",
+                "label": "Enrich documents with metadata",
+                "order": 17,
+                "tooltip": "Enable this option to enrich all documents with structured metadata including category, division, content type, and other SharePoint managed properties. The metadata will be stored as an array of key-value pairs in a 'metadata' field.",
+                "type": "bool",
+                "value": True,
+                "ui_restrictions": ["advanced"],
+            },
         }
 
     async def validate_config(self):
@@ -1462,6 +1471,235 @@ def _decorate_with_access_control(self, document, access_control):
 
         return document
 
+    def _extract_metadata_from_sharepoint_fields(self, document, site=None, site_drive=None, site_list=None):
+        """
+        Extract metadata from SharePoint fields and map them to standardized metadata structure.
+        
+        Args:
+            document: The SharePoint document/item
+            site: Site information (optional)
+            site_drive: Drive information (optional) 
+            site_list: List information (optional)
+            
+        Returns:
+            dict: Extracted metadata as key-value pairs
+        """
+        metadata = {}
+        
+
+        fields = document.get("fields", {})
+        
+        # Map SharePoint managed properties to your metadata structure
+        # Based on your SEARCH Managed properties table
+        # Category mapping
+        odc_category = fields.get("OPDCategory") or fields.get("ODCCategory")
+        if odc_category:
+            metadata["Category"] = odc_category
+        else:
+            # Try to determine category from site URL or other indicators
+            if site and site.get("webUrl"):
+                site_url = site["webUrl"].lower()
+                if "odc" in site_url:
+                    metadata["Category"] = "ODC"
+                elif "xdesk" in site_url:
+                    metadata["Category"] = "Xdesk"
+                else:
+                    metadata["Category"] = None
+            else:
+                metadata["Category"] = None
+        
+        # Business Unit / Division
+        metadata["Division"] = fields.get("BusinessUnit")
+        metadata["Department"] = fields.get("BusinessUnit")  # Using same field as Department
+        
+        # Document Type
+        metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document)
+        
+        # Activity and Project information
+        metadata["ActivityID"] = fields.get("ActivityID")
+        metadata["ActivityName"] = fields.get("ActivityName")
+        metadata["ProjectID"] = fields.get("ProjectID")
+        metadata["ProjectType"] = fields.get("ProjectType")
+        
+        # Geographic and temporal metadata
+        metadata["Region"] = fields.get("Region")
+        metadata["FocusCountry"] = fields.get("FocusCountry")
+        metadata["Year"] = fields.get("Year")
+        metadata["Phase"] = fields.get("Phase")
+        
+        # Status and classification
+        metadata["Status"] = fields.get("OPDStatus")
+        metadata["GrantType"] = fields.get("GrantType")
+        metadata["GrantWindow"] = fields.get("GrantWindow")
+        
+        # Boolean flags
+        metadata["Disclosable"] = fields.get("Disclosable")
+        metadata["NonIFAD"] = fields.get("NonIfad")
+        metadata["PLF"] = fields.get("PLF")
+        
+        # System information
+        metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource")
+        
+        return metadata
+
+    def _determine_content_type(self, document):
+        """
+        Determine content type based on document properties.
+        
+        Args:
+            document: The SharePoint document
+            
+        Returns:
+            str: Content type classification
+        """
+        object_type = document.get("object_type", "")
+        
+        if object_type == "drive_item":
+            name = document.get("name", "")
+            if "folder" in document:
+                return "Folder"
+            elif name:
+                ext = os.path.splitext(name)[-1].lower()
+                if ext in ['.ppt', '.pptx']:
+                    return "Presentation"
+                elif ext in ['.doc', '.docx', '.pdf']:
+                    return "Document"
+                elif ext in ['.xls', '.xlsx']:
+                    return "Spreadsheet"
+                elif ext in ['.mp4', '.avi', '.mov']:
+                    return "Video"
+                elif ext in ['.jpg', '.jpeg', '.png', '.gif']:
+                    return "Image"
+                else:
+                    return "Document"
+        elif object_type == "site_page":
+            return "Web Page"
+        elif object_type == "list_item":
+            return "List Item"
+        elif object_type == "list_item_attachment":
+            return "Attachment"
+        else:
+            return "Document"
+
+    def _build_metadata_array(self, document, site=None, site_drive=None, site_list=None):
+        """
+        Build metadata array as key-value pairs for the document.
+        
+        Args:
+            document: The SharePoint document
+            site: Site context
+            site_drive: Drive context  
+            site_list: List context
+            
+        Returns:
+            list: Array of metadata key-value pairs
+        """
+        metadata_pairs = []
+        
+        # Extract SharePoint-specific metadata
+        sharepoint_metadata = self._extract_metadata_from_sharepoint_fields(
+            document, site, site_drive, site_list
+        )
+        
+        # Standard metadata that should always be present
+        
+        # Site Name
+        site_name = None
+        if site:
+            site_name = site.get("displayName") or site.get("name") or site.get("title")
+        metadata_pairs.append({"key": "Site Name", "value": site_name})
+        
+        # Document Library / Drive Name
+        library_name = None
+        if site_drive:
+            library_name = site_drive.get("name") or site_drive.get("displayName")
+        elif site_list:
+            library_name = site_list.get("name") or site_list.get("displayName")
+        metadata_pairs.append({"key": "Document Library", "value": library_name})
+        
+        # File Type/Extension
+        file_extension = None
+        file_name = document.get("name") or document.get("_original_filename") or document.get("FileName", "")
+        if file_name and "." in file_name:
+            file_extension = os.path.splitext(file_name)[-1].lower()
+        metadata_pairs.append({"key": "File Type", "value": file_extension})
+        
+        # File Path/Location
+        file_path = None
+        if document.get("webUrl"):
+            file_path = document["webUrl"]
+        elif document.get("parentReference", {}).get("path"):
+            file_path = document["parentReference"]["path"]
+        elif site and site.get("webUrl"):
+            # Construct path from site URL and document name
+            site_path = self._site_path_from_web_url(site["webUrl"])
+            if file_name:
+                file_path = f"{site_path}/{file_name}"
+            else:
+                file_path = site_path
+        metadata_pairs.append({"key": "File Path", "value": file_path})
+        
+        # Add all SharePoint metadata fields
+        for key, value in sharepoint_metadata.items():
+            metadata_pairs.append({"key": key, "value": value})
+        
+        # Additional technical metadata
+        metadata_pairs.append({"key": "Object Type", "value": document.get("object_type")})
+        metadata_pairs.append({"key": "Document ID", "value": document.get("_id")})
+        metadata_pairs.append({"key": "Last Modified", "value": document.get("_timestamp") or document.get("lastModifiedDateTime")})
+        
+        # Size information for files
+        if document.get("size"):
+            metadata_pairs.append({"key": "File Size", "value": document.get("size")})
+        
+        # Creator information
+        created_by = None
+        if document.get("createdBy", {}).get("user", {}).get("displayName"):
+            created_by = document["createdBy"]["user"]["displayName"]
+        elif document.get("createdBy", {}).get("user", {}).get("email"):
+            created_by = document["createdBy"]["user"]["email"]
+        metadata_pairs.append({"key": "Created By", "value": created_by})
+        
+        # Modified by information  
+        modified_by = None
+        if document.get("lastModifiedBy", {}).get("user", {}).get("displayName"):
+            modified_by = document["lastModifiedBy"]["user"]["displayName"]
+        elif document.get("lastModifiedBy", {}).get("user", {}).get("email"):
+            modified_by = document["lastModifiedBy"]["user"]["email"]
+        metadata_pairs.append({"key": "Modified By", "value": modified_by})
+        
+        return metadata_pairs
+
+    def _enrich_document_with_metadata(self, document, site=None, site_drive=None, site_list=None):
+        """
+        Enrich document with metadata array.
+        
+        Args:
+            document: The document to enrich
+            site: Site context
+            site_drive: Drive context
+            site_list: List context
+            
+        Returns:
+            dict: Document enriched with metadata
+        """
+        # Check if metadata enrichment is enabled
+        if not self.configuration.get("enrich_metadata", True):
+            return document
+            
+        try:
+            metadata_array = self._build_metadata_array(document, site, site_drive, site_list)
+            document["metadata"] = metadata_array
+            
+            self._logger.debug(f"Enriched document {document.get('_id')} with {len(metadata_array)} metadata pairs")
+            
+        except Exception as e:
+            self._logger.warning(f"Failed to enrich document {document.get('_id')} with metadata: {str(e)}")
+            # Ensure at least an empty metadata array
+            document["metadata"] = []
+        
+        return document
+
     async def _site_access_control(self, site):
         """Fetches all permissions for all owners, members and visitors of a given site.
         All groups and/or persons, which have permissions for a given site are returned with their given identity prefix ("user", "group" or "email").
@@ -1751,6 +1989,8 @@ async def get_docs(self, filtering=None):
             max_drive_item_age = advanced_rules["skipExtractingDriveItemsOlderThan"]
 
         async for site_collection in self.site_collections():
+            # Enrich site collection with metadata
+            site_collection = self._enrich_document_with_metadata(site_collection)
             yield site_collection, None
 
             async for site in self.sites(
@@ -1762,16 +2002,26 @@ async def get_docs(self, filtering=None):
                     site_admin_access_control,
                 ) = await self._site_access_control(site)
 
+                # Enrich site with metadata and access control
+                enriched_site = self._enrich_document_with_metadata(site)
+                enriched_site = self._decorate_with_access_control(enriched_site, site_access_control)
+
                 yield (
-                    self._decorate_with_access_control(site, site_access_control),
+                    enriched_site,
                     None,
                 )
 
                 async for site_drive in self.site_drives(site):
+                    # Enrich site drive with metadata and access control
+                    enriched_site_drive = self._enrich_document_with_metadata(
+                        site_drive, site=site, site_drive=site_drive
+                    )
+                    enriched_site_drive = self._decorate_with_access_control(
+                        enriched_site_drive, site_access_control
+                    )
+                    
                     yield (
-                        self._decorate_with_access_control(
-                            site_drive, site_access_control
-                        ),
+                        enriched_site_drive,
                         None,
                     )
 
@@ -1790,6 +2040,11 @@ async def get_docs(self, filtering=None):
                                     "lastModifiedDateTime"
                                 )
 
+                                # Enrich with metadata
+                                drive_item = self._enrich_document_with_metadata(
+                                    drive_item, site=site, site_drive=site_drive
+                                )
+
                                 # Drive items should inherit site access controls only if
                                 # 'fetch_drive_item_permissions' is disabled in the config
                                 if not self.configuration[
@@ -1853,6 +2108,8 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None):
             max_drive_item_age = advanced_rules["skipExtractingDriveItemsOlderThan"]
 
         async for site_collection in self.site_collections():
+            # Enrich site collection with metadata
+            site_collection = self._enrich_document_with_metadata(site_collection)
             yield site_collection, None, OP_INDEX
 
             async for site in self.sites(
@@ -1865,8 +2122,12 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None):
                     site_admin_access_control,
                 ) = await self._site_access_control(site)
 
+                # Enrich site with metadata and access control
+                enriched_site = self._enrich_document_with_metadata(site)
+                enriched_site = self._decorate_with_access_control(enriched_site, site_access_control)
+
                 yield (
-                    self._decorate_with_access_control(site, site_access_control),
+                    enriched_site,
                     None,
                     OP_INDEX,
                 )
@@ -1875,10 +2136,16 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None):
                 # lastModifiedDateTime of the parent site_drive. Therefore, we
                 # set check_timestamp to False when iterating over site_drives.
                 async for site_drive in self.site_drives(site, check_timestamp=False):
+                    # Enrich site drive with metadata and access control
+                    enriched_site_drive = self._enrich_document_with_metadata(
+                        site_drive, site=site, site_drive=site_drive
+                    )
+                    enriched_site_drive = self._decorate_with_access_control(
+                        enriched_site_drive, site_access_control
+                    )
+                    
                     yield (
-                        self._decorate_with_access_control(
-                            site_drive, site_access_control
-                        ),
+                        enriched_site_drive,
                         None,
                         OP_INDEX,
                     )
@@ -1902,6 +2169,11 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None):
                                     "lastModifiedDateTime"
                                 )
 
+                                # Enrich with metadata
+                                drive_item = self._enrich_document_with_metadata(
+                                    drive_item, site=site, site_drive=site_drive
+                                )
+
                                 # Drive items should inherit site access controls only if
                                 # 'fetch_drive_item_permissions' is disabled in the config
                                 if not self.configuration[
@@ -2099,13 +2371,18 @@ def _get_login_name(permissions, label):
 
         return self._decorate_with_access_control(drive_item, access_control)
 
-    async def drive_items(self, site_drive, max_drive_item_age):
+    async def drive_items(self, site_drive, max_drive_item_age, site=None):
         async for page in self.client.drive_items(site_drive["id"]):
             for drive_item in page:
                 drive_item["_id"] = drive_item["id"]
                 drive_item["object_type"] = "drive_item"
                 drive_item["_timestamp"] = drive_item["lastModifiedDateTime"]
 
+                # Enrich with metadata
+                drive_item = self._enrich_document_with_metadata(
+                    drive_item, site=site, site_drive=site_drive
+                )
+
                 yield drive_item, self.download_function(drive_item, max_drive_item_age)
 
     async def site_list_items(
@@ -2215,11 +2492,21 @@ async def site_list_items(
                                 ACCESS_CONTROL, []
                             )
 
+                        # Enrich attachment with metadata before yielding
+                        list_item_attachment = self._enrich_document_with_metadata(
+                            list_item_attachment, site=site, site_list={"id": site_list_id, "name": site_list_name}
+                        )
+
                         attachment_download_func = partial(
                             self.get_attachment_content, list_item_attachment
                         )
                         yield list_item_attachment, attachment_download_func
 
+                # Enrich list item with metadata before yielding
+                list_item = self._enrich_document_with_metadata(
+                    list_item, site=site, site_list={"id": site_list_id, "name": site_list_name}
+                )
+
                 yield list_item, None
 
     async def site_lists(self, site, site_access_control, check_timestamp=False):
@@ -2272,6 +2559,11 @@ async def site_lists(self, site, site_access_control, check_timestamp=False):
                         site_list, site_access_control
                     )
 
+                # Enrich site list with metadata before yielding
+                site_list = self._enrich_document_with_metadata(
+                    site_list, site=site, site_list=site_list
+                )
+
                 yield site_list
 
     async def _get_access_control_from_role_assignment(self, role_assignment):
@@ -2407,6 +2699,11 @@ async def site_pages(self, site, site_access_control, check_timestamp=False):
                     if html_field in site_page:
                         site_page[html_field] = html_to_text(site_page[html_field])
 
+                # Enrich site page with metadata before yielding
+                site_page = self._enrich_document_with_metadata(
+                    site_page, site=site
+                )
+
                 yield site_page
 
     def init_sync_cursor(self):

From 3946e62aed073206e23067e6c79c585222d112c0 Mon Sep 17 00:00:00 2001
From: claudiu <c.colesnicencu@ifad.org>
Date: Thu, 17 Jul 2025 10:17:01 +0300
Subject: [PATCH 6/9] Added ODC_MANAGED_PROPERTIES that are appended to
 DRIVE_ITEMS_FIELDS for the ODC category

---
 connectors/sources/sharepoint_online.py | 100 +++++++++++++++---------
 1 file changed, 63 insertions(+), 37 deletions(-)

diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
index 413ba846a..5e608e990 100644
--- a/connectors/sources/sharepoint_online.py
+++ b/connectors/sources/sharepoint_online.py
@@ -74,8 +74,12 @@
 FILE_WRITE_CHUNK_SIZE = 1024 * 64  # 64KB default SSD page size
 MAX_DOCUMENT_SIZE = 10485760
 WILDCARD = "*"
+# Base fields for all drive items
 DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"
 
+# Additional ODC-specific managed properties
+ODC_MANAGED_PROPERTIES = "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes"
+
 CURSOR_SITE_DRIVE_KEY = "site_drives"
 
 # Microsoft Graph API Delta constants
@@ -885,10 +889,17 @@ async def drive_items_delta(self, url):
             if "value" in response and len(response["value"]) > 0:
                 yield DriveItemsPage(response["value"], delta_link)
 
-    async def drive_items(self, drive_id, url=None):
+    async def drive_items(self, drive_id, url=None, site=None):
+        # Build field list with conditional ODC properties
+        fields = DRIVE_ITEMS_FIELDS
+        
+        # Add ODC managed properties if this is an ODC site
+        if site and self._is_odc_site(site):
+            fields = f"{DRIVE_ITEMS_FIELDS},{ODC_MANAGED_PROPERTIES}"
+        
         url = (
             (
-                f"{GRAPH_API_URL}/drives/{drive_id}/root/delta?$select={DRIVE_ITEMS_FIELDS}"
+                f"{GRAPH_API_URL}/drives/{drive_id}/root/delta?$select={fields}"
             )
             if not url
             else url
@@ -897,6 +908,19 @@ async def drive_items(self, drive_id, url=None):
         async for page in self.drive_items_delta(url):
             yield page
 
+    def _is_odc_site(self, site):
+        """Check if site is an ODC (Office Development Center) site based on URL or name patterns."""
+        if not site:
+            return False
+            
+        web_url = site.get("webUrl", "").lower()
+        site_name = site.get("name", "").lower()
+        
+        # Check for ODC indicators in URL or site name
+        odc_indicators = ["odc", "office-development", "dev-center", "development-center"]
+        
+        return any(indicator in web_url or indicator in site_name for indicator in odc_indicators)
+
     async def drive_items_permissions_batch(self, drive_id, drive_item_ids):
         requests = []
 
@@ -1501,6 +1525,39 @@ def _extract_metadata_from_sharepoint_fields(self, document, site=None, site_dri
                 site_url = site["webUrl"].lower()
                 if "odc" in site_url:
                     metadata["Category"] = "ODC"
+
+                    # Business Unit / Division
+                    metadata["Division"] = fields.get("BusinessUnit")
+                    metadata["Department"] = fields.get("BusinessUnit")  # Using same field as Department
+                    
+                    # Document Type
+                    metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document)
+                    
+                    # Activity and Project information
+                    metadata["ActivityID"] = fields.get("ActivityID")
+                    metadata["ActivityName"] = fields.get("ActivityName")
+                    metadata["ProjectID"] = fields.get("ProjectID")
+                    metadata["ProjectType"] = fields.get("ProjectType")
+                    
+                    # Geographic and temporal metadata
+                    metadata["Region"] = fields.get("Region")
+                    metadata["FocusCountry"] = fields.get("FocusCountry")
+                    metadata["Year"] = fields.get("Year")
+                    metadata["Phase"] = fields.get("Phase")
+                    
+                    # Status and classification
+                    metadata["Status"] = fields.get("OPDStatus")
+                    metadata["GrantType"] = fields.get("GrantType")
+                    metadata["GrantWindow"] = fields.get("GrantWindow")
+                    
+                    # Boolean flags
+                    metadata["Disclosable"] = fields.get("Disclosable")
+                    metadata["NonIFAD"] = fields.get("NonIfad")
+                    metadata["PLF"] = fields.get("PLF")
+                    
+                    # System information
+                    metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource")
+        
                 elif "xdesk" in site_url:
                     metadata["Category"] = "Xdesk"
                 else:
@@ -1508,38 +1565,7 @@ def _extract_metadata_from_sharepoint_fields(self, document, site=None, site_dri
             else:
                 metadata["Category"] = None
         
-        # Business Unit / Division
-        metadata["Division"] = fields.get("BusinessUnit")
-        metadata["Department"] = fields.get("BusinessUnit")  # Using same field as Department
-        
-        # Document Type
-        metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document)
-        
-        # Activity and Project information
-        metadata["ActivityID"] = fields.get("ActivityID")
-        metadata["ActivityName"] = fields.get("ActivityName")
-        metadata["ProjectID"] = fields.get("ProjectID")
-        metadata["ProjectType"] = fields.get("ProjectType")
-        
-        # Geographic and temporal metadata
-        metadata["Region"] = fields.get("Region")
-        metadata["FocusCountry"] = fields.get("FocusCountry")
-        metadata["Year"] = fields.get("Year")
-        metadata["Phase"] = fields.get("Phase")
-        
-        # Status and classification
-        metadata["Status"] = fields.get("OPDStatus")
-        metadata["GrantType"] = fields.get("GrantType")
-        metadata["GrantWindow"] = fields.get("GrantWindow")
-        
-        # Boolean flags
-        metadata["Disclosable"] = fields.get("Disclosable")
-        metadata["NonIFAD"] = fields.get("NonIfad")
-        metadata["PLF"] = fields.get("PLF")
-        
-        # System information
-        metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource")
-        
+  
         return metadata
 
     def _determine_content_type(self, document):
@@ -2025,7 +2051,7 @@ async def get_docs(self, filtering=None):
                         None,
                     )
 
-                    async for page in self.client.drive_items(site_drive["id"]):
+                    async for page in self.client.drive_items(site_drive["id"], site=site):
                         for drive_items_batch in iterable_batches_generator(
                             page.items, SPO_API_MAX_BATCH_SIZE
                         ):
@@ -2153,7 +2179,7 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None):
                     delta_link = self.get_drive_delta_link(site_drive["id"])
 
                     async for page in self.client.drive_items(
-                        drive_id=site_drive["id"], url=delta_link
+                        drive_id=site_drive["id"], url=delta_link, site=site
                     ):
                         for drive_items_batch in iterable_batches_generator(
                             page.items, SPO_API_MAX_BATCH_SIZE
@@ -2372,7 +2398,7 @@ def _get_login_name(permissions, label):
         return self._decorate_with_access_control(drive_item, access_control)
 
     async def drive_items(self, site_drive, max_drive_item_age, site=None):
-        async for page in self.client.drive_items(site_drive["id"]):
+        async for page in self.client.drive_items(site_drive["id"], site=site):
             for drive_item in page:
                 drive_item["_id"] = drive_item["id"]
                 drive_item["object_type"] = "drive_item"

From e60dbf2ac8b02526ca31ff9853d84a322c37f4f1 Mon Sep 17 00:00:00 2001
From: claudiu <claudiu>
Date: Tue, 29 Jul 2025 16:49:29 +0300
Subject: [PATCH 7/9] Move metadata enrichment code into a separate module

---
 .../sources/sharepoint_metadata_enricher.py   | 339 ++++++++++++++++++
 connectors/sources/sharepoint_online.py       | 237 +-----------
 2 files changed, 358 insertions(+), 218 deletions(-)
 create mode 100644 connectors/sources/sharepoint_metadata_enricher.py

diff --git a/connectors/sources/sharepoint_metadata_enricher.py b/connectors/sources/sharepoint_metadata_enricher.py
new file mode 100644
index 000000000..19c2190cd
--- /dev/null
+++ b/connectors/sources/sharepoint_metadata_enricher.py
@@ -0,0 +1,339 @@
+import os
+from typing import Dict, List, Optional, Any
+
+
+class SharePointMetadataEnricher:
+
+    def __init__(self, logger=None):
+        self.logger = logger
+
+    def _log_debug(self, message: str):
+        if self.logger:
+            self.logger.debug(message)
+
+    def _log_info(self, message: str):
+        if self.logger:
+            self.logger.info(message)
+
+    def _log_warning(self, message: str):
+        if self.logger:
+            self.logger.warning(message)
+
+    def _extract_metadata_from_sharepoint_fields(
+        self, 
+        document: Dict[str, Any], 
+        site: Optional[Dict[str, Any]] = None, 
+        site_drive: Optional[Dict[str, Any]] = None, 
+        site_list: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        self._log_info(f"Starting metadata extraction for document {document.get('_id', 'unknown')}")
+        
+        metadata = {}
+        fields = document.get("fields", {})
+        
+        self._log_debug(f"SharePoint fields for document {document.get('_id', 'unknown')}: {fields}")
+        self._log_info(f"Found {len(fields)} SharePoint fields for document {document.get('_id', 'unknown')}")
+        
+        if site and site.get("webUrl"):
+            site_url = site["webUrl"].lower()
+            if "odc" in site_url:
+                metadata["Category"] = "ODC"
+                self._log_info(f"Detected ODC category from site URL: {site_url}")
+            elif "xdesk" in site_url:
+                metadata["Category"] = "Xdesk"
+                self._log_info(f"Detected Xdesk category from site URL: {site_url}")
+            else:
+                metadata["Category"] = None
+                self._log_info(f"No specific category detected from site URL: {site_url}")
+        else:
+            metadata["Category"] = None
+            self._log_info("No site URL available for category detection")
+
+        metadata["Division"] = fields.get("BusinessUnit")
+        metadata["Department"] = fields.get("BusinessUnit") 
+        # Document Type
+        metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document)
+        
+        # Activity and Project information
+        metadata["ActivityID"] = fields.get("ActivityID")
+        metadata["ActivityName"] = fields.get("ActivityName")
+        metadata["ProjectID"] = fields.get("ProjectID")
+        metadata["ProjectType"] = fields.get("ProjectType")
+        
+        # Geographic and temporal metadata
+        metadata["Region"] = fields.get("Region")
+        metadata["FocusCountry"] = fields.get("FocusCountry")
+        metadata["Year"] = fields.get("Year")
+        metadata["Phase"] = fields.get("Phase")
+        
+        # Status and classification
+        metadata["Status"] = fields.get("OPDStatus") or fields.get("Status")
+        metadata["GrantType"] = fields.get("GrantType")
+        metadata["GrantWindow"] = fields.get("GrantWindow")
+        
+        # Boolean flags
+        metadata["Disclosable"] = fields.get("Disclosable")
+        metadata["NonIFAD"] = fields.get("NonIfad")
+        metadata["PLF"] = fields.get("PLF")
+        
+        # System information
+        metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource")
+        
+        # Additional common SharePoint fields that might be present
+        metadata["Title"] = fields.get("Title")
+        metadata["Author"] = fields.get("Author")
+        metadata["Editor"] = fields.get("Editor")
+        metadata["Created"] = fields.get("Created")
+        metadata["Modified"] = fields.get("Modified")
+        metadata["FileLeafRef"] = fields.get("FileLeafRef")
+        metadata["FileDirRef"] = fields.get("FileDirRef")
+        metadata["ContentType"] = fields.get("ContentType")
+        metadata["FileType"] = fields.get("File_x0020_Type")
+        
+        # Check for any OPD/ODC category fields
+        odc_category = fields.get("OPDCategory") or fields.get("ODCCategory")
+        if odc_category:
+            metadata["Category"] = odc_category
+            self._log_info(f"Override category with ODC field value: {odc_category}")
+
+        self._log_info(f"Completed metadata extraction with {len(metadata)} fields for document {document.get('_id', 'unknown')}")
+        return metadata
+
+    def _determine_content_type(self, document: Dict[str, Any]) -> str:
+        object_type = document.get("object_type", "")
+        
+        if object_type == "drive_item":
+            name = document.get("name", "")
+            if "folder" in document:
+                return "Folder"
+            elif name:
+                ext = os.path.splitext(name)[-1].lower()
+                if ext in ['.ppt', '.pptx']:
+                    return "Presentation"
+                elif ext in ['.doc', '.docx', '.pdf']:
+                    return "Document"
+                elif ext in ['.xls', '.xlsx']:
+                    return "Spreadsheet"
+                elif ext in ['.mp4', '.avi', '.mov']:
+                    return "Video"
+                elif ext in ['.jpg', '.jpeg', '.png', '.gif']:
+                    return "Image"
+                else:
+                    return "Document"
+            else:
+                return "Document"
+        elif object_type == "site_page":
+            return "Web Page"
+        elif object_type == "list_item":
+            return "List Item"
+        elif object_type == "list_item_attachment":
+            return "Attachment"
+        else:
+            return "Document"
+
+    def _site_path_from_web_url(self, web_url: str) -> str:
+        url_parts = web_url.split("/sites/")
+        site_path_parts = url_parts[1:]
+        return "/sites/".join(site_path_parts)
+
+    def build_metadata_array(
+        self, 
+        document: Dict[str, Any], 
+        site: Optional[Dict[str, Any]] = None, 
+        site_drive: Optional[Dict[str, Any]] = None, 
+        site_list: Optional[Dict[str, Any]] = None
+    ) -> List[Dict[str, Any]]:
+        # Build metadata array as key-value pairs for the document
+        self._log_info(f"Starting to build metadata array for document {document.get('_id', 'unknown')}")
+        metadata_pairs = []
+        
+        try:
+            # Extract SharePoint-specific metadata
+            sharepoint_metadata = self._extract_metadata_from_sharepoint_fields(
+                document, site, site_drive, site_list
+            )
+            
+            self._log_info(f"Building standard metadata pairs for document {document.get('_id', 'unknown')}")
+            
+            # Standard metadata that should always be present
+            
+            # Category (required field)
+            metadata_pairs.append({
+                "key": "Category", 
+                "value": sharepoint_metadata.get("Category")
+            })
+            
+            # Site Name
+            site_name = None
+            if site:
+                site_name = site.get("displayName") or site.get("name") or site.get("title")
+                self._log_info(f"Found site name: {site_name}")
+            metadata_pairs.append({"key": "Site Name", "value": site_name})
+            
+            # Document Library / Drive Name
+            library_name = None
+            if site_drive:
+                library_name = site_drive.get("name") or site_drive.get("displayName")
+                self._log_info(f"Found drive library: {library_name}")
+            elif site_list:
+                library_name = site_list.get("name") or site_list.get("displayName")
+                self._log_info(f"Found list library: {library_name}")
+            metadata_pairs.append({"key": "Document Library", "value": library_name})
+            
+            # Division and Department (required fields)
+            metadata_pairs.append({
+                "key": "Division", 
+                "value": sharepoint_metadata.get("Division")
+            })
+            metadata_pairs.append({
+                "key": "Department", 
+                "value": sharepoint_metadata.get("Department")
+            })
+            
+            # Content-Type (required field)
+            metadata_pairs.append({
+                "key": "Content-Type", 
+                "value": sharepoint_metadata.get("Content-Type")
+            })
+            
+            # File Type/Extension
+            file_extension = None
+            file_name = document.get("name") or document.get("_original_filename") or document.get("FileName", "")
+            if file_name and "." in file_name:
+                file_extension = os.path.splitext(file_name)[-1].lower()
+                self._log_info(f"Detected file extension: {file_extension} for file: {file_name}")
+            metadata_pairs.append({"key": "File Type", "value": file_extension})
+            
+            # File Path/Location
+            file_path = None
+            if document.get("webUrl"):
+                file_path = document["webUrl"]
+                self._log_info(f"Using document webUrl as file path: {file_path}")
+            elif document.get("parentReference", {}).get("path"):
+                file_path = document["parentReference"]["path"]
+                self._log_info(f"Using parentReference path as file path: {file_path}")
+            elif site and site.get("webUrl"):
+                # Construct path from site URL and document name
+                site_path = self._site_path_from_web_url(site["webUrl"])
+                if file_name:
+                    file_path = f"{site_path}/{file_name}"
+                else:
+                    file_path = site_path
+                self._log_info(f"Constructed file path from site URL: {file_path}")
+            metadata_pairs.append({"key": "File Path", "value": file_path})
+            
+            # Add all SharePoint metadata fields for all documents
+            self._log_info(f"Adding SharePoint-specific fields for document {document.get('_id', 'unknown')}")
+            sharepoint_fields = [
+                "ActivityID", "ActivityName", "ProjectID", "ProjectType",
+                "Region", "FocusCountry", "Year", "Phase", "Status",
+                "GrantType", "GrantWindow", "Disclosable", "NonIFAD", 
+                "PLF", "SystemSource", "Title", "Author", "Editor",
+                "Created", "Modified", "FileLeafRef", "FileDirRef",
+                "ContentType", "FileType"
+            ]
+            
+            added_fields_count = 0
+            for field in sharepoint_fields:
+                if field in sharepoint_metadata and sharepoint_metadata[field] is not None:
+                    metadata_pairs.append({
+                        "key": field, 
+                        "value": sharepoint_metadata[field]
+                    })
+                    added_fields_count += 1
+            
+            self._log_info(f"Added {added_fields_count} SharePoint-specific fields to metadata")
+            
+            # Additional technical metadata
+            self._log_info(f"Adding technical metadata for document {document.get('_id', 'unknown')}")
+            metadata_pairs.append({"key": "Object Type", "value": document.get("object_type")})
+            metadata_pairs.append({"key": "Document ID", "value": document.get("_id")})
+            metadata_pairs.append({
+                "key": "Last Modified", 
+                "value": document.get("_timestamp") or document.get("lastModifiedDateTime")
+            })
+            
+            # Size information for files
+            if document.get("size"):
+                self._log_info(f"Found file size: {document.get('size')} bytes")
+                metadata_pairs.append({"key": "File Size", "value": document.get("size")})
+            
+            # Creator information
+            created_by = None
+            if document.get("createdBy", {}).get("user", {}).get("displayName"):
+                created_by = document["createdBy"]["user"]["displayName"]
+                self._log_info(f"Found creator display name: {created_by}")
+            elif document.get("createdBy", {}).get("user", {}).get("email"):
+                created_by = document["createdBy"]["user"]["email"]
+                self._log_info(f"Found creator email: {created_by}")
+            metadata_pairs.append({"key": "Created By", "value": created_by})
+            
+            # Modified by information  
+            modified_by = None
+            if document.get("lastModifiedBy", {}).get("user", {}).get("displayName"):
+                modified_by = document["lastModifiedBy"]["user"]["displayName"]
+                self._log_info(f"Found last modifier display name: {modified_by}")
+            elif document.get("lastModifiedBy", {}).get("user", {}).get("email"):
+                modified_by = document["lastModifiedBy"]["user"]["email"]
+                self._log_info(f"Found last modifier email: {modified_by}")
+            metadata_pairs.append({"key": "Modified By", "value": modified_by})
+            
+            self._log_info(f"Built {len(metadata_pairs)} metadata pairs for document {document.get('_id')}")
+            
+        except Exception as e:
+            self._log_warning(f"Error building metadata array for document {document.get('_id')}: {str(e)}")
+            # Return minimal metadata on error
+            self._log_info("Returning minimal metadata due to error")
+            metadata_pairs = [
+                {"key": "Category", "value": None},
+                {"key": "Site Name", "value": None},
+                {"key": "Document Library", "value": None},
+                {"key": "Division", "value": None},
+                {"key": "Department", "value": None},
+                {"key": "Content-Type", "value": None},
+                {"key": "File Type", "value": None},
+                {"key": "File Path", "value": None}
+            ]
+        
+        return metadata_pairs
+
+    def enrich_document_with_metadata(
+        self, 
+        document: Dict[str, Any], 
+        site: Optional[Dict[str, Any]] = None, 
+        site_drive: Optional[Dict[str, Any]] = None, 
+        site_list: Optional[Dict[str, Any]] = None,
+        enrich_metadata_enabled: bool = True
+    ) -> Dict[str, Any]:
+
+        if not enrich_metadata_enabled:
+            self._log_info(f"Metadata enrichment disabled for document {document.get('_id', 'unknown')}")
+            return document
+            
+        # Create a copy of the document to avoid modifying the original
+        self._log_info(f"Starting metadata enrichment for document {document.get('_id', 'unknown')}")
+        enriched_document = document.copy()
+            
+        try:
+            metadata_array = self.build_metadata_array(enriched_document, site, site_drive, site_list)
+            enriched_document["metadata"] = metadata_array
+            
+            self._log_info(f"Successfully enriched document {enriched_document.get('_id')} with {len(metadata_array)} metadata pairs")
+            
+        except Exception as e:
+            self._log_warning(f"Failed to enrich document {enriched_document.get('_id')} with metadata: {str(e)}")
+            # Ensure at least an empty metadata array with required fields
+            self._log_info("Setting fallback metadata array with required fields")
+            enriched_document["metadata"] = [
+                {"key": "Category", "value": None},
+                {"key": "Site Name", "value": None},
+                {"key": "Document Library", "value": None},
+                {"key": "Division", "value": None},
+                {"key": "Department", "value": None},
+                {"key": "Content-Type", "value": None},
+                {"key": "File Type", "value": None},
+                {"key": "File Path", "value": None}
+            ]
+        
+        self._log_info(f"Completed metadata enrichment for document {enriched_document.get('_id', 'unknown')}")
+        return enriched_document
diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
index 5e608e990..8a612e1a5 100644
--- a/connectors/sources/sharepoint_online.py
+++ b/connectors/sources/sharepoint_online.py
@@ -33,6 +33,7 @@
 )
 from connectors.logger import logger
 from connectors.source import CURSOR_SYNC_TIMESTAMP, BaseDataSource
+from connectors.sources.sharepoint_metadata_enricher import SharePointMetadataEnricher
 from connectors.utils import (
     TIKA_SUPPORTED_FILETYPES,
     CacheWithTimeout,
@@ -1260,9 +1261,18 @@ def __init__(self, configuration):
 
         self._client = None
         self.site_group_cache = {}
+        self._metadata_enricher = None
 
     def _set_internal_logger(self):
         self.client.set_logger(self._logger)
+        # Initialize metadata enricher with logger
+        self._metadata_enricher = SharePointMetadataEnricher(logger=self._logger)
+
+    @property
+    def metadata_enricher(self):
+        if not self._metadata_enricher:
+            self._metadata_enricher = SharePointMetadataEnricher(logger=self._logger)
+        return self._metadata_enricher
 
     @property
     def client(self):
@@ -1495,210 +1505,9 @@ def _decorate_with_access_control(self, document, access_control):
 
         return document
 
-    def _extract_metadata_from_sharepoint_fields(self, document, site=None, site_drive=None, site_list=None):
-        """
-        Extract metadata from SharePoint fields and map them to standardized metadata structure.
-        
-        Args:
-            document: The SharePoint document/item
-            site: Site information (optional)
-            site_drive: Drive information (optional) 
-            site_list: List information (optional)
-            
-        Returns:
-            dict: Extracted metadata as key-value pairs
-        """
-        metadata = {}
-        
-
-        fields = document.get("fields", {})
-        
-        # Map SharePoint managed properties to your metadata structure
-        # Based on your SEARCH Managed properties table
-        # Category mapping
-        odc_category = fields.get("OPDCategory") or fields.get("ODCCategory")
-        if odc_category:
-            metadata["Category"] = odc_category
-        else:
-            # Try to determine category from site URL or other indicators
-            if site and site.get("webUrl"):
-                site_url = site["webUrl"].lower()
-                if "odc" in site_url:
-                    metadata["Category"] = "ODC"
-
-                    # Business Unit / Division
-                    metadata["Division"] = fields.get("BusinessUnit")
-                    metadata["Department"] = fields.get("BusinessUnit")  # Using same field as Department
-                    
-                    # Document Type
-                    metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document)
-                    
-                    # Activity and Project information
-                    metadata["ActivityID"] = fields.get("ActivityID")
-                    metadata["ActivityName"] = fields.get("ActivityName")
-                    metadata["ProjectID"] = fields.get("ProjectID")
-                    metadata["ProjectType"] = fields.get("ProjectType")
-                    
-                    # Geographic and temporal metadata
-                    metadata["Region"] = fields.get("Region")
-                    metadata["FocusCountry"] = fields.get("FocusCountry")
-                    metadata["Year"] = fields.get("Year")
-                    metadata["Phase"] = fields.get("Phase")
-                    
-                    # Status and classification
-                    metadata["Status"] = fields.get("OPDStatus")
-                    metadata["GrantType"] = fields.get("GrantType")
-                    metadata["GrantWindow"] = fields.get("GrantWindow")
-                    
-                    # Boolean flags
-                    metadata["Disclosable"] = fields.get("Disclosable")
-                    metadata["NonIFAD"] = fields.get("NonIfad")
-                    metadata["PLF"] = fields.get("PLF")
-                    
-                    # System information
-                    metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource")
-        
-                elif "xdesk" in site_url:
-                    metadata["Category"] = "Xdesk"
-                else:
-                    metadata["Category"] = None
-            else:
-                metadata["Category"] = None
-        
-  
-        return metadata
-
-    def _determine_content_type(self, document):
-        """
-        Determine content type based on document properties.
-        
-        Args:
-            document: The SharePoint document
-            
-        Returns:
-            str: Content type classification
-        """
-        object_type = document.get("object_type", "")
-        
-        if object_type == "drive_item":
-            name = document.get("name", "")
-            if "folder" in document:
-                return "Folder"
-            elif name:
-                ext = os.path.splitext(name)[-1].lower()
-                if ext in ['.ppt', '.pptx']:
-                    return "Presentation"
-                elif ext in ['.doc', '.docx', '.pdf']:
-                    return "Document"
-                elif ext in ['.xls', '.xlsx']:
-                    return "Spreadsheet"
-                elif ext in ['.mp4', '.avi', '.mov']:
-                    return "Video"
-                elif ext in ['.jpg', '.jpeg', '.png', '.gif']:
-                    return "Image"
-                else:
-                    return "Document"
-        elif object_type == "site_page":
-            return "Web Page"
-        elif object_type == "list_item":
-            return "List Item"
-        elif object_type == "list_item_attachment":
-            return "Attachment"
-        else:
-            return "Document"
-
-    def _build_metadata_array(self, document, site=None, site_drive=None, site_list=None):
-        """
-        Build metadata array as key-value pairs for the document.
-        
-        Args:
-            document: The SharePoint document
-            site: Site context
-            site_drive: Drive context  
-            site_list: List context
-            
-        Returns:
-            list: Array of metadata key-value pairs
-        """
-        metadata_pairs = []
-        
-        # Extract SharePoint-specific metadata
-        sharepoint_metadata = self._extract_metadata_from_sharepoint_fields(
-            document, site, site_drive, site_list
-        )
-        
-        # Standard metadata that should always be present
-        
-        # Site Name
-        site_name = None
-        if site:
-            site_name = site.get("displayName") or site.get("name") or site.get("title")
-        metadata_pairs.append({"key": "Site Name", "value": site_name})
-        
-        # Document Library / Drive Name
-        library_name = None
-        if site_drive:
-            library_name = site_drive.get("name") or site_drive.get("displayName")
-        elif site_list:
-            library_name = site_list.get("name") or site_list.get("displayName")
-        metadata_pairs.append({"key": "Document Library", "value": library_name})
-        
-        # File Type/Extension
-        file_extension = None
-        file_name = document.get("name") or document.get("_original_filename") or document.get("FileName", "")
-        if file_name and "." in file_name:
-            file_extension = os.path.splitext(file_name)[-1].lower()
-        metadata_pairs.append({"key": "File Type", "value": file_extension})
-        
-        # File Path/Location
-        file_path = None
-        if document.get("webUrl"):
-            file_path = document["webUrl"]
-        elif document.get("parentReference", {}).get("path"):
-            file_path = document["parentReference"]["path"]
-        elif site and site.get("webUrl"):
-            # Construct path from site URL and document name
-            site_path = self._site_path_from_web_url(site["webUrl"])
-            if file_name:
-                file_path = f"{site_path}/{file_name}"
-            else:
-                file_path = site_path
-        metadata_pairs.append({"key": "File Path", "value": file_path})
-        
-        # Add all SharePoint metadata fields
-        for key, value in sharepoint_metadata.items():
-            metadata_pairs.append({"key": key, "value": value})
-        
-        # Additional technical metadata
-        metadata_pairs.append({"key": "Object Type", "value": document.get("object_type")})
-        metadata_pairs.append({"key": "Document ID", "value": document.get("_id")})
-        metadata_pairs.append({"key": "Last Modified", "value": document.get("_timestamp") or document.get("lastModifiedDateTime")})
-        
-        # Size information for files
-        if document.get("size"):
-            metadata_pairs.append({"key": "File Size", "value": document.get("size")})
-        
-        # Creator information
-        created_by = None
-        if document.get("createdBy", {}).get("user", {}).get("displayName"):
-            created_by = document["createdBy"]["user"]["displayName"]
-        elif document.get("createdBy", {}).get("user", {}).get("email"):
-            created_by = document["createdBy"]["user"]["email"]
-        metadata_pairs.append({"key": "Created By", "value": created_by})
-        
-        # Modified by information  
-        modified_by = None
-        if document.get("lastModifiedBy", {}).get("user", {}).get("displayName"):
-            modified_by = document["lastModifiedBy"]["user"]["displayName"]
-        elif document.get("lastModifiedBy", {}).get("user", {}).get("email"):
-            modified_by = document["lastModifiedBy"]["user"]["email"]
-        metadata_pairs.append({"key": "Modified By", "value": modified_by})
-        
-        return metadata_pairs
-
     def _enrich_document_with_metadata(self, document, site=None, site_drive=None, site_list=None):
         """
-        Enrich document with metadata array.
+        Enrich document with metadata using the dedicated metadata enricher.
         
         Args:
             document: The document to enrich
@@ -1709,22 +1518,14 @@ def _enrich_document_with_metadata(self, document, site=None, site_drive=None, s
         Returns:
             dict: Document enriched with metadata
         """
-        # Check if metadata enrichment is enabled
-        if not self.configuration.get("enrich_metadata", True):
-            return document
-            
-        try:
-            metadata_array = self._build_metadata_array(document, site, site_drive, site_list)
-            document["metadata"] = metadata_array
-            
-            self._logger.debug(f"Enriched document {document.get('_id')} with {len(metadata_array)} metadata pairs")
-            
-        except Exception as e:
-            self._logger.warning(f"Failed to enrich document {document.get('_id')} with metadata: {str(e)}")
-            # Ensure at least an empty metadata array
-            document["metadata"] = []
-        
-        return document
+        enrich_enabled = bool(self.configuration.get("enrich_metadata", True))
+        return self.metadata_enricher.enrich_document_with_metadata(
+            document=document,
+            site=site,
+            site_drive=site_drive,
+            site_list=site_list,
+            enrich_metadata_enabled=enrich_enabled
+        )
 
     async def _site_access_control(self, site):
         """Fetches all permissions for all owners, members and visitors of a given site.

From b8c3e15dc643a6eea7ab3ebd54fa135cb5fc341a Mon Sep 17 00:00:00 2001
From: claudiu <c.colesnicencu@ifad.org>
Date: Thu, 31 Jul 2025 12:38:22 +0300
Subject: [PATCH 8/9] Updated metadata enricher with new data

---
 .../sources/sharepoint_metadata_enricher.py   | 200 +++++++++++++-----
 connectors/sources/sharepoint_online.py       |  47 ++--
 2 files changed, 184 insertions(+), 63 deletions(-)

diff --git a/connectors/sources/sharepoint_metadata_enricher.py b/connectors/sources/sharepoint_metadata_enricher.py
index 19c2190cd..b8ea9eea5 100644
--- a/connectors/sources/sharepoint_metadata_enricher.py
+++ b/connectors/sources/sharepoint_metadata_enricher.py
@@ -1,11 +1,18 @@
 import os
 from typing import Dict, List, Optional, Any
 
+# ODC-specific managed properties for SharePoint Graph API calls
+ODC_MANAGED_PROPERTIES = "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes"
+
+# Graph API URL constant
+GRAPH_API_URL = "https://graph.microsoft.com/v1.0"
+
 
 class SharePointMetadataEnricher:
 
-    def __init__(self, logger=None):
+    def __init__(self, logger=None, graph_api_client=None):
         self.logger = logger
+        self._graph_api_client = graph_api_client
 
     def _log_debug(self, message: str):
         if self.logger:
@@ -19,6 +26,129 @@ def _log_warning(self, message: str):
         if self.logger:
             self.logger.warning(message)
 
+    def _is_odc_site(self, site):
+        """Check if site is an ODC site based on the official ODC site URLs."""
+        if not site:
+            return False
+            
+        web_url = (site.get("webUrl") or "").lower()
+        
+        # Official ODC site URLs
+        odc_sites = [
+            "aprop/",
+            "lacop",
+            "esaop",
+            "nenop",
+            "wcaop",
+            "epop"
+        ]
+        
+        # Match whole path segments so "/sites/epop" does not also match "/sites/epopular"
+        for odc_site in odc_sites:
+            if f"/sites/{odc_site.rstrip('/')}/" in f"{web_url.rstrip('/')}/":
+                return True
+                
+        return False
+
+    def get_odc_managed_properties(self):
+        """Return the module-level ODC_MANAGED_PROPERTIES string (comma-separated property names)."""
+        return ODC_MANAGED_PROPERTIES
+
+    def should_include_odc_properties(self, site):
+        """Return the result of _is_odc_site(site): truthy when ODC properties should be requested."""
+        return self._is_odc_site(site)
+
+    async def get_drive_list_mapping(self, site_id, site_drives_method, site_lists_method):
+        """
+        Get mapping between drives and their corresponding SharePoint lists.
+        This is needed to fetch custom metadata for drive items.
+        """
+        drive_list_mapping = {}
+        
+        try:
+            # Fetch the site's lists once up front instead of
+            # re-fetching them for every single drive
+            site_lists = [site_list async for site_list in site_lists_method(site_id)]
+            
+            async for drive in site_drives_method(site_id):
+                drive_id = drive.get("id")
+                drive_name = (drive.get("name") or "").lower()
+                
+                for site_list in site_lists:
+                    list_id = site_list.get("id")
+                    list_name = site_list.get("name") or site_list.get("displayName") or ""
+                    list_key = list_name.lower()
+                    
+                    # Heuristic match only - in reality the mapping can be complex.
+                    # Unnamed lists are skipped: "" is a substring of every drive name.
+                    if list_key and ("document" in list_key or "library" in list_key or list_key in drive_name):
+                        drive_list_mapping[drive_id] = list_id
+                        self._log_info(f"Mapped drive '{drive.get('name')}' ({drive_id}) to list '{list_name}' ({list_id})")
+                        break
+                        
+        except Exception as e:
+            self._log_warning(f"Error creating drive-list mapping for site {site_id}: {str(e)}")
+            
+        return drive_list_mapping
+
+    async def get_drive_item_list_fields(self, drive_id, item_id):
+        """
+        Get custom metadata fields for a drive item via the listItem/fields endpoint.
+        Returns the fetched fields, or {} when there is no client or the request fails.
+        """
+        if not self._graph_api_client:
+            self._log_warning("No Graph API client available for fetching metadata fields")
+            return {}
+            
+        try:
+            url = f"{GRAPH_API_URL}/drives/{drive_id}/items/{item_id}/listItem/fields"
+            response = (await self._graph_api_client.fetch(url)) or {}
+            self._log_info(f"Retrieved {len(response)} custom fields for drive item {item_id}")
+            return response
+        except Exception as e:
+            if "404" in str(e) or "NotFound" in str(e):
+                # Item might be deleted or not linked to a listItem
+                self._log_debug(f"No listItem/fields found for drive item {item_id} (404)")
+            else:
+                # Anything else is unexpected: warn, like the sibling methods do, instead of hiding it at debug
+                self._log_warning(f"Failed to get listItem/fields for item {item_id}: {str(e)}")
+            return {}
+
+    async def enrich_drive_item_with_list_metadata(self, drive_item, site_id=None, drive_list_mapping=None):
+        """
+        Enrich a drive item with custom metadata using the working listItem/fields approach.
+        Returns a shallow copy with a "fields" key when fields were fetched, else the item unchanged.
+        site_id and drive_list_mapping are accepted but not used by this implementation.
+        """
+        try:
+            item_id = drive_item.get("id")
+            # Drive ID comes from parentReference, which may be missing or null
+            drive_id = (drive_item.get("parentReference") or {}).get("driveId")
+            
+            if not drive_id or not item_id:
+                self._log_debug(f"Missing drive_id or item_id for drive item {item_id}")
+                return drive_item
+            
+            # Items carrying a "deleted" facet are skipped: the request would presumably just 404
+            if drive_item.get("deleted"):
+                return drive_item
+            
+            # Use the working approach: /drives/{drive_id}/items/{item_id}/listItem/fields
+            custom_fields = await self.get_drive_item_list_fields(drive_id, item_id)
+            
+            if custom_fields:
+                enriched_item = drive_item.copy()
+                # Add the SharePoint list fields to the drive item
+                enriched_item["fields"] = custom_fields
+                self._log_info(f"Enriched drive item {item_id} with {len(custom_fields)} SharePoint fields from listItem/fields")
+                return enriched_item
+            else:
+                self._log_debug(f"No SharePoint listItem/fields found for drive item {item_id}")
+                
+        except Exception as e:
+            self._log_warning(f"Error enriching drive item {drive_item.get('id')} with listItem/fields metadata: {str(e)}")
+            
+        return drive_item
+
     def _extract_metadata_from_sharepoint_fields(
         self, 
         document: Dict[str, Any], 
@@ -26,32 +156,26 @@ def _extract_metadata_from_sharepoint_fields(
         site_drive: Optional[Dict[str, Any]] = None, 
         site_list: Optional[Dict[str, Any]] = None
     ) -> Dict[str, Any]:
-        self._log_info(f"Starting metadata extraction for document {document.get('_id', 'unknown')}")
+        self._log_info(f"Extracting metadata for document {document.get('_id', 'unknown')}")
         
         metadata = {}
         fields = document.get("fields", {})
         
-        self._log_debug(f"SharePoint fields for document {document.get('_id', 'unknown')}: {fields}")
-        self._log_info(f"Found {len(fields)} SharePoint fields for document {document.get('_id', 'unknown')}")
-        
+        # Determine category from site URL using ODC site detection
         if site and site.get("webUrl"):
-            site_url = site["webUrl"].lower()
-            if "odc" in site_url:
+            if self._is_odc_site(site):
                 metadata["Category"] = "ODC"
-                self._log_info(f"Detected ODC category from site URL: {site_url}")
-            elif "xdesk" in site_url:
-                metadata["Category"] = "Xdesk"
-                self._log_info(f"Detected Xdesk category from site URL: {site_url}")
             else:
-                metadata["Category"] = None
-                self._log_info(f"No specific category detected from site URL: {site_url}")
+                site_url = site["webUrl"].lower()
+                if "xdesk" in site_url:
+                    metadata["Category"] = "Xdesk"
+                else:
+                    metadata["Category"] = "General"
         else:
             metadata["Category"] = None
-            self._log_info("No site URL available for category detection")
 
         metadata["Division"] = fields.get("BusinessUnit")
         metadata["Department"] = fields.get("BusinessUnit") 
-        # Document Type
         metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document)
         
         # Activity and Project information
@@ -59,6 +183,9 @@ def _extract_metadata_from_sharepoint_fields(
         metadata["ActivityName"] = fields.get("ActivityName")
         metadata["ProjectID"] = fields.get("ProjectID")
         metadata["ProjectType"] = fields.get("ProjectType")
+        metadata["ProjectName"] = fields.get("ProjectName")
+        metadata["ProjectTitle"] = fields.get("ProjectTitle")
+        metadata["ProjectSector"] = fields.get("ProjectSector")
         
         # Geographic and temporal metadata
         metadata["Region"] = fields.get("Region")
@@ -79,7 +206,7 @@ def _extract_metadata_from_sharepoint_fields(
         # System information
         metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource")
         
-        # Additional common SharePoint fields that might be present
+        # Additional common SharePoint fields
         metadata["Title"] = fields.get("Title")
         metadata["Author"] = fields.get("Author")
         metadata["Editor"] = fields.get("Editor")
@@ -94,9 +221,8 @@ def _extract_metadata_from_sharepoint_fields(
         odc_category = fields.get("OPDCategory") or fields.get("ODCCategory")
         if odc_category:
             metadata["Category"] = odc_category
-            self._log_info(f"Override category with ODC field value: {odc_category}")
 
-        self._log_info(f"Completed metadata extraction with {len(metadata)} fields for document {document.get('_id', 'unknown')}")
+        self._log_info(f"Extracted metadata with {len([v for v in metadata.values() if v is not None])} non-null fields")
         return metadata
 
     def _determine_content_type(self, document: Dict[str, Any]) -> str:
@@ -143,8 +269,7 @@ def build_metadata_array(
         site_drive: Optional[Dict[str, Any]] = None, 
         site_list: Optional[Dict[str, Any]] = None
     ) -> List[Dict[str, Any]]:
-        # Build metadata array as key-value pairs for the document
-        self._log_info(f"Starting to build metadata array for document {document.get('_id', 'unknown')}")
+        self._log_info(f"Building metadata array for document {document.get('_id', 'unknown')}")
         metadata_pairs = []
         
         try:
@@ -153,11 +278,7 @@ def build_metadata_array(
                 document, site, site_drive, site_list
             )
             
-            self._log_info(f"Building standard metadata pairs for document {document.get('_id', 'unknown')}")
-            
             # Standard metadata that should always be present
-            
-            # Category (required field)
             metadata_pairs.append({
                 "key": "Category", 
                 "value": sharepoint_metadata.get("Category")
@@ -167,17 +288,14 @@ def build_metadata_array(
             site_name = None
             if site:
                 site_name = site.get("displayName") or site.get("name") or site.get("title")
-                self._log_info(f"Found site name: {site_name}")
             metadata_pairs.append({"key": "Site Name", "value": site_name})
             
             # Document Library / Drive Name
             library_name = None
             if site_drive:
                 library_name = site_drive.get("name") or site_drive.get("displayName")
-                self._log_info(f"Found drive library: {library_name}")
             elif site_list:
                 library_name = site_list.get("name") or site_list.get("displayName")
-                self._log_info(f"Found list library: {library_name}")
             metadata_pairs.append({"key": "Document Library", "value": library_name})
             
             # Division and Department (required fields)
@@ -201,31 +319,26 @@ def build_metadata_array(
             file_name = document.get("name") or document.get("_original_filename") or document.get("FileName", "")
             if file_name and "." in file_name:
                 file_extension = os.path.splitext(file_name)[-1].lower()
-                self._log_info(f"Detected file extension: {file_extension} for file: {file_name}")
             metadata_pairs.append({"key": "File Type", "value": file_extension})
             
             # File Path/Location
             file_path = None
             if document.get("webUrl"):
                 file_path = document["webUrl"]
-                self._log_info(f"Using document webUrl as file path: {file_path}")
             elif document.get("parentReference", {}).get("path"):
                 file_path = document["parentReference"]["path"]
-                self._log_info(f"Using parentReference path as file path: {file_path}")
             elif site and site.get("webUrl"):
-                # Construct path from site URL and document name
                 site_path = self._site_path_from_web_url(site["webUrl"])
                 if file_name:
                     file_path = f"{site_path}/{file_name}"
                 else:
                     file_path = site_path
-                self._log_info(f"Constructed file path from site URL: {file_path}")
             metadata_pairs.append({"key": "File Path", "value": file_path})
             
             # Add all SharePoint metadata fields for all documents
-            self._log_info(f"Adding SharePoint-specific fields for document {document.get('_id', 'unknown')}")
             sharepoint_fields = [
                 "ActivityID", "ActivityName", "ProjectID", "ProjectType",
+                "ProjectName", "ProjectTitle", "ProjectSector",
                 "Region", "FocusCountry", "Year", "Phase", "Status",
                 "GrantType", "GrantWindow", "Disclosable", "NonIFAD", 
                 "PLF", "SystemSource", "Title", "Author", "Editor",
@@ -233,19 +346,14 @@ def build_metadata_array(
                 "ContentType", "FileType"
             ]
             
-            added_fields_count = 0
             for field in sharepoint_fields:
                 if field in sharepoint_metadata and sharepoint_metadata[field] is not None:
                     metadata_pairs.append({
                         "key": field, 
                         "value": sharepoint_metadata[field]
                     })
-                    added_fields_count += 1
-            
-            self._log_info(f"Added {added_fields_count} SharePoint-specific fields to metadata")
             
             # Additional technical metadata
-            self._log_info(f"Adding technical metadata for document {document.get('_id', 'unknown')}")
             metadata_pairs.append({"key": "Object Type", "value": document.get("object_type")})
             metadata_pairs.append({"key": "Document ID", "value": document.get("_id")})
             metadata_pairs.append({
@@ -255,35 +363,29 @@ def build_metadata_array(
             
             # Size information for files
             if document.get("size"):
-                self._log_info(f"Found file size: {document.get('size')} bytes")
                 metadata_pairs.append({"key": "File Size", "value": document.get("size")})
             
             # Creator information
             created_by = None
             if document.get("createdBy", {}).get("user", {}).get("displayName"):
                 created_by = document["createdBy"]["user"]["displayName"]
-                self._log_info(f"Found creator display name: {created_by}")
             elif document.get("createdBy", {}).get("user", {}).get("email"):
                 created_by = document["createdBy"]["user"]["email"]
-                self._log_info(f"Found creator email: {created_by}")
             metadata_pairs.append({"key": "Created By", "value": created_by})
             
             # Modified by information  
             modified_by = None
             if document.get("lastModifiedBy", {}).get("user", {}).get("displayName"):
                 modified_by = document["lastModifiedBy"]["user"]["displayName"]
-                self._log_info(f"Found last modifier display name: {modified_by}")
             elif document.get("lastModifiedBy", {}).get("user", {}).get("email"):
                 modified_by = document["lastModifiedBy"]["user"]["email"]
-                self._log_info(f"Found last modifier email: {modified_by}")
             metadata_pairs.append({"key": "Modified By", "value": modified_by})
             
-            self._log_info(f"Built {len(metadata_pairs)} metadata pairs for document {document.get('_id')}")
+            self._log_info(f"Built {len(metadata_pairs)} metadata pairs")
             
         except Exception as e:
             self._log_warning(f"Error building metadata array for document {document.get('_id')}: {str(e)}")
             # Return minimal metadata on error
-            self._log_info("Returning minimal metadata due to error")
             metadata_pairs = [
                 {"key": "Category", "value": None},
                 {"key": "Site Name", "value": None},
@@ -307,23 +409,20 @@ def enrich_document_with_metadata(
     ) -> Dict[str, Any]:
 
         if not enrich_metadata_enabled:
-            self._log_info(f"Metadata enrichment disabled for document {document.get('_id', 'unknown')}")
             return document
             
-        # Create a copy of the document to avoid modifying the original
-        self._log_info(f"Starting metadata enrichment for document {document.get('_id', 'unknown')}")
+        self._log_info(f"Enriching document {document.get('_id', 'unknown')} with metadata")
         enriched_document = document.copy()
             
         try:
             metadata_array = self.build_metadata_array(enriched_document, site, site_drive, site_list)
             enriched_document["metadata"] = metadata_array
             
-            self._log_info(f"Successfully enriched document {enriched_document.get('_id')} with {len(metadata_array)} metadata pairs")
+            self._log_info(f"Successfully enriched document with {len(metadata_array)} metadata pairs")
             
         except Exception as e:
             self._log_warning(f"Failed to enrich document {enriched_document.get('_id')} with metadata: {str(e)}")
             # Ensure at least an empty metadata array with required fields
-            self._log_info("Setting fallback metadata array with required fields")
             enriched_document["metadata"] = [
                 {"key": "Category", "value": None},
                 {"key": "Site Name", "value": None},
@@ -335,5 +434,4 @@ def enrich_document_with_metadata(
                 {"key": "File Path", "value": None}
             ]
         
-        self._log_info(f"Completed metadata enrichment for document {enriched_document.get('_id', 'unknown')}")
         return enriched_document
diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
index 8a612e1a5..85576f51c 100644
--- a/connectors/sources/sharepoint_online.py
+++ b/connectors/sources/sharepoint_online.py
@@ -78,9 +78,6 @@
 # Base fields for all drive items
 DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"
 
-# Additional ODC-specific managed properties
-ODC_MANAGED_PROPERTIES = "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes"
-
 CURSOR_SITE_DRIVE_KEY = "site_drives"
 
 # Microsoft Graph API Delta constants
@@ -890,13 +887,21 @@ async def drive_items_delta(self, url):
             if "value" in response and len(response["value"]) > 0:
                 yield DriveItemsPage(response["value"], delta_link)
 
-    async def drive_items(self, drive_id, url=None, site=None):
+    async def drive_items(self, drive_id, url=None, site=None, metadata_enricher=None):
         # Build field list with conditional ODC properties
         fields = DRIVE_ITEMS_FIELDS
         
         # Add ODC managed properties if this is an ODC site
-        if site and self._is_odc_site(site):
-            fields = f"{DRIVE_ITEMS_FIELDS},{ODC_MANAGED_PROPERTIES}"
+        # Use metadata enricher's ODC detection if available, otherwise fall back to local method
+        is_odc = False
+        if metadata_enricher and hasattr(metadata_enricher, 'should_include_odc_properties'):
+            is_odc = metadata_enricher.should_include_odc_properties(site)
+        elif site:
+            is_odc = self._is_odc_site(site)
+            
+        if is_odc and metadata_enricher:
+            odc_properties = metadata_enricher.get_odc_managed_properties()
+            fields = f"{DRIVE_ITEMS_FIELDS},{odc_properties}"
         
         url = (
             (
@@ -1265,13 +1270,19 @@ def __init__(self, configuration):
 
     def _set_internal_logger(self):
         self.client.set_logger(self._logger)
-        # Initialize metadata enricher with logger
-        self._metadata_enricher = SharePointMetadataEnricher(logger=self._logger)
+        # Initialize metadata enricher with logger and graph api client
+        self._metadata_enricher = SharePointMetadataEnricher(
+            logger=self._logger, 
+            graph_api_client=self.client._graph_api_client
+        )
 
     @property
     def metadata_enricher(self):
         if not self._metadata_enricher:
-            self._metadata_enricher = SharePointMetadataEnricher(logger=self._logger)
+            self._metadata_enricher = SharePointMetadataEnricher(
+                logger=self._logger, 
+                graph_api_client=self.client._graph_api_client
+            )
         return self._metadata_enricher
 
     @property
@@ -1852,7 +1863,7 @@ async def get_docs(self, filtering=None):
                         None,
                     )
 
-                    async for page in self.client.drive_items(site_drive["id"], site=site):
+                    async for page in self.client.drive_items(site_drive["id"], site=site, metadata_enricher=self.metadata_enricher):
                         for drive_items_batch in iterable_batches_generator(
                             page.items, SPO_API_MAX_BATCH_SIZE
                         ):
@@ -1867,6 +1878,12 @@ async def get_docs(self, filtering=None):
                                     "lastModifiedDateTime"
                                 )
 
+                                # HYBRID APPROACH: Enrich drive item with SharePoint list metadata using listItem/fields
+                                if self.configuration.get("enrich_metadata", True):
+                                    drive_item = await self.metadata_enricher.enrich_drive_item_with_list_metadata(
+                                        drive_item, site["id"]
+                                    )
+
                                 # Enrich with metadata
                                 drive_item = self._enrich_document_with_metadata(
                                     drive_item, site=site, site_drive=site_drive
@@ -1980,7 +1997,7 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None):
                     delta_link = self.get_drive_delta_link(site_drive["id"])
 
                     async for page in self.client.drive_items(
-                        drive_id=site_drive["id"], url=delta_link, site=site
+                        drive_id=site_drive["id"], url=delta_link, site=site, metadata_enricher=self.metadata_enricher
                     ):
                         for drive_items_batch in iterable_batches_generator(
                             page.items, SPO_API_MAX_BATCH_SIZE
@@ -1996,6 +2013,12 @@ async def get_docs_incrementally(self, sync_cursor, filtering=None):
                                     "lastModifiedDateTime"
                                 )
 
+                                # HYBRID APPROACH: Enrich drive item with SharePoint list metadata using listItem/fields
+                                if self.configuration.get("enrich_metadata", True):
+                                    drive_item = await self.metadata_enricher.enrich_drive_item_with_list_metadata(
+                                        drive_item, site["id"]
+                                    )
+
                                 # Enrich with metadata
                                 drive_item = self._enrich_document_with_metadata(
                                     drive_item, site=site, site_drive=site_drive
@@ -2199,7 +2222,7 @@ def _get_login_name(permissions, label):
         return self._decorate_with_access_control(drive_item, access_control)
 
     async def drive_items(self, site_drive, max_drive_item_age, site=None):
-        async for page in self.client.drive_items(site_drive["id"], site=site):
+        async for page in self.client.drive_items(site_drive["id"], site=site, metadata_enricher=self.metadata_enricher):
             for drive_item in page:
                 drive_item["_id"] = drive_item["id"]
                 drive_item["object_type"] = "drive_item"

From d54027d812232a6706f5e8ad4bba1ef5ee05e079 Mon Sep 17 00:00:00 2001
From: ygdrax <hireClaudiu@protonmail.com>
Date: Mon, 4 Aug 2025 16:52:59 +0300
Subject: [PATCH 9/9] Updated metadata structure and odc condition

---
 .../sources/sharepoint_metadata_enricher.py   | 143 +++++++++++++++---
 1 file changed, 125 insertions(+), 18 deletions(-)

diff --git a/connectors/sources/sharepoint_metadata_enricher.py b/connectors/sources/sharepoint_metadata_enricher.py
index b8ea9eea5..47e10445d 100644
--- a/connectors/sources/sharepoint_metadata_enricher.py
+++ b/connectors/sources/sharepoint_metadata_enricher.py
@@ -2,7 +2,8 @@
 from typing import Dict, List, Optional, Any
 
 # ODC-specific managed properties for SharePoint Graph API calls
-ODC_MANAGED_PROPERTIES = "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes"
+# Includes both ODC and ODP (Operations Document Portal) fields
+ODC_MANAGED_PROPERTIES = "ActivityID,BusinessUnit,OPDCategory,LOB,Division,DocumentType,FinancialYear,Quarter,Month,Owner,Reviewer,Approver,Status,Priority,Confidentiality,Retention,Compliance,RelatedProjects,Tags,Keywords,Notes,ProjectID,ProjectName,ProjectTitle,ProjectType,ProjectSector,Phase,Region,Country,FocusCountry,GrantType,GrantWindow,GrantRecipient,Theme,ShortName,AllDocuments,Disclosable,Disclosed,NonIFAD,PLF,Sensitive,BorrowerID,CopyValidation,DocumentTypeID,ODCIntegration_CIMission"
 
 # Graph API URL constant
 GRAPH_API_URL = "https://graph.microsoft.com/v1.0"
@@ -35,7 +36,7 @@ def _is_odc_site(self, site):
         
         # Official ODC site URLs
         odc_sites = [
-            "aprop/",
+            "aprop",
             "lacop",
             "esaop",
             "nenop",
@@ -161,6 +162,28 @@ def _extract_metadata_from_sharepoint_fields(
         metadata = {}
         fields = document.get("fields", {})
         
+        # Helper function to extract value from complex SharePoint field structures
+        def extract_field_value(field_data):
+            # Lists: unwrap each element when the list holds dicts (managed metadata /
+            # lookup arrays); any other list, including an empty one, is returned as-is.
+            if isinstance(field_data, list):
+                if field_data and isinstance(field_data[0], dict):
+                    return [extract_field_value(item) for item in field_data]
+                return field_data
+            
+            # Simple string/number/boolean (and None) values pass through untouched
+            if not isinstance(field_data, dict):
+                return field_data
+            
+            # Dicts: managed metadata fields carry "Label"; lookup fields or other
+            # object structures carry "DisplayName" or "Value". Keys are tried in
+            # that order and the first one present wins.
+            for known_key in ("Label", "DisplayName", "Value"):
+                if known_key in field_data:
+                    return field_data[known_key]
+            # A dict with none of the known keys yields None
+            return None
+        
         # Determine category from site URL using ODC site detection
         if site and site.get("webUrl"):
             if self._is_odc_site(site):
@@ -174,37 +197,78 @@ def _extract_metadata_from_sharepoint_fields(
         else:
             metadata["Category"] = None
 
+        # Core business metadata
         metadata["Division"] = fields.get("BusinessUnit")
         metadata["Department"] = fields.get("BusinessUnit") 
-        metadata["Content-Type"] = fields.get("DocumentType") or self._determine_content_type(document)
         
-        # Activity and Project information
-        metadata["ActivityID"] = fields.get("ActivityID")
-        metadata["ActivityName"] = fields.get("ActivityName")
-        metadata["ProjectID"] = fields.get("ProjectID")
+        # Document type - handle both simple and complex field structures
+        doc_type = extract_field_value(fields.get("DocumentType"))
+        metadata["Content-Type"] = doc_type or self._determine_content_type(document)
+        
+        # Project information - handle complex field structures
+        project_id = extract_field_value(fields.get("ProjectID"))
+        if project_id:
+            metadata["ProjectID"] = project_id
+        else:
+            # Fallback to hidden field
+            metadata["ProjectID"] = fields.get("ProjectID_Hidden")
+            
         metadata["ProjectType"] = fields.get("ProjectType")
         metadata["ProjectName"] = fields.get("ProjectName")
         metadata["ProjectTitle"] = fields.get("ProjectTitle")
         metadata["ProjectSector"] = fields.get("ProjectSector")
         
+        # Handle ShortName and AllDocuments (project short names)
+        short_name = extract_field_value(fields.get("ShortName"))
+        all_documents = extract_field_value(fields.get("AllDocuments"))
+        metadata["ShortName"] = short_name
+        metadata["AllDocuments"] = all_documents
+        
         # Geographic and temporal metadata
         metadata["Region"] = fields.get("Region")
-        metadata["FocusCountry"] = fields.get("FocusCountry")
+        metadata["Country"] = fields.get("Country")
+        metadata["CountryID"] = fields.get("CountryID")
+        metadata["FocusCountryIDs"] = fields.get("FocusCountryIDs")
+        
+        # Handle complex FocusCountry field
+        focus_country = extract_field_value(fields.get("FocusCountry"))
+        metadata["FocusCountry"] = focus_country
+        
         metadata["Year"] = fields.get("Year")
         metadata["Phase"] = fields.get("Phase")
+        metadata["PhaseID"] = fields.get("PhaseID")
         
-        # Status and classification
-        metadata["Status"] = fields.get("OPDStatus") or fields.get("Status")
+        # Grant and financing information
         metadata["GrantType"] = fields.get("GrantType")
         metadata["GrantWindow"] = fields.get("GrantWindow")
+        grant_recipient = extract_field_value(fields.get("GrantRecipient"))
+        metadata["GrantRecipient"] = grant_recipient
+        metadata["BorrowerID"] = fields.get("BorrowerID")
         
-        # Boolean flags
+        # Themes and topics
+        themes = extract_field_value(fields.get("Theme"))
+        metadata["Theme"] = themes
+        
+        # Document classification and status
         metadata["Disclosable"] = fields.get("Disclosable")
+        metadata["Disclosed"] = fields.get("Disclosed")
         metadata["NonIFAD"] = fields.get("NonIfad")
         metadata["PLF"] = fields.get("PLF")
+        metadata["Sensitive"] = fields.get("Sensitive")
+        metadata["IsInDocSet"] = fields.get("IsInDocSet")
+        metadata["OPDIsLink"] = fields.get("OPDIsLink")
+        
+        # Validation and compliance
+        metadata["CopyValidation"] = fields.get("CopyValidation")
+        metadata["SentToRMS"] = fields.get("SentToRMS")
         
-        # System information
+        # System and integration fields
+        metadata["ODCIntegration_CIMission"] = fields.get("ODCIntegration_CIMission")
         metadata["SystemSource"] = fields.get("ODCIntegration_SystemSource")
+        metadata["DocumentTypeID"] = fields.get("DocumentTypeID")
+        
+        # Project reference field
+        metadata["Project"] = fields.get("Project")
         
         # Additional common SharePoint fields
         metadata["Title"] = fields.get("Title")
@@ -217,6 +281,26 @@ def _extract_metadata_from_sharepoint_fields(
         metadata["ContentType"] = fields.get("ContentType")
         metadata["FileType"] = fields.get("File_x0020_Type")
         
+        # Document icon and size
+        metadata["DocIcon"] = fields.get("DocIcon")
+        metadata["FileSizeDisplay"] = fields.get("FileSizeDisplay")
+        
+        # Document ID and linking
+        dlc_doc_id = fields.get("_dlc_DocIdUrl")
+        if isinstance(dlc_doc_id, dict) and "Description" in dlc_doc_id:
+            metadata["DocumentID"] = dlc_doc_id["Description"]
+            metadata["DocumentIDUrl"] = dlc_doc_id.get("Url")
+        
+        # Version information
+        metadata["UIVersionString"] = fields.get("_UIVersionString")
+        
+        # Activity information (for ODC compatibility)
+        metadata["ActivityID"] = fields.get("ActivityID")
+        metadata["ActivityName"] = fields.get("ActivityName")
+        
+        # Legacy status field handling
+        metadata["Status"] = fields.get("OPDStatus") or fields.get("Status")
+        
         # Check for any OPD/ODC category fields
         odc_category = fields.get("OPDCategory") or fields.get("ODCCategory")
         if odc_category:
@@ -337,13 +421,36 @@ def build_metadata_array(
             
             # Add all SharePoint metadata fields for all documents
             sharepoint_fields = [
+                # Project and Activity Information
                 "ActivityID", "ActivityName", "ProjectID", "ProjectType",
-                "ProjectName", "ProjectTitle", "ProjectSector",
-                "Region", "FocusCountry", "Year", "Phase", "Status",
-                "GrantType", "GrantWindow", "Disclosable", "NonIFAD", 
-                "PLF", "SystemSource", "Title", "Author", "Editor",
-                "Created", "Modified", "FileLeafRef", "FileDirRef",
-                "ContentType", "FileType"
+                "ProjectName", "ProjectTitle", "ProjectSector", "Project",
+                "ShortName", "AllDocuments",
+                
+                # Geographic and Temporal
+                "Region", "Country", "CountryID", "FocusCountry", "FocusCountryIDs",
+                "Year", "Phase", "PhaseID",
+                
+                # Grant and Financing
+                "GrantType", "GrantWindow", "GrantRecipient", "BorrowerID",
+                
+                # Themes and Classification
+                "Theme", "Status", "DocumentTypeID",
+                
+                # Flags and Status
+                "Disclosable", "Disclosed", "NonIFAD", "PLF", "Sensitive", 
+                "IsInDocSet", "OPDIsLink",
+                
+                # Validation and Compliance
+                "CopyValidation", "SentToRMS",
+                
+                # System and Integration
+                "SystemSource", "ODCIntegration_CIMission",
+                
+                # Standard SharePoint Fields
+                "Title", "Author", "Editor", "Created", "Modified", 
+                "FileLeafRef", "FileDirRef", "ContentType", "FileType",
+                "DocIcon", "FileSizeDisplay", "DocumentID", "DocumentIDUrl",
+                "UIVersionString"
             ]
             
             for field in sharepoint_fields: