Skip to content

Commit 587d1b2

Browse files
committed
feat: 新增 CSV、XLSX、XLS 文件类型解析支持
1 parent 3a2c86d commit 587d1b2

File tree

7 files changed

+118
-2
lines changed

7 files changed

+118
-2
lines changed

docreader/parser/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@
1313
meaningful chunks for further processing and indexing.
1414
"""
1515

16+
from .csv_parser import CSVParser
1617
from .doc_parser import DocParser
1718
from .docx2_parser import Docx2Parser
19+
from .excel_parser import ExcelParser
1820
from .image_parser import ImageParser
1921
from .markdown_parser import MarkdownParser
2022
from .parser import Parser
@@ -32,4 +34,6 @@
3234
"ImageParser", # Parser for images with text content
3335
"WebParser", # Parser for web pages
3436
"Parser", # Main parser factory that selects the appropriate parser
37+
"CSVParser", # Parser for CSV files
38+
"ExcelParser", # Parser for Excel files
3539
]

docreader/parser/base_parser.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,9 @@ def parse(self, content: bytes) -> Document:
339339
logger.info(
340340
f"Extracted {len(document.content)} characters from {self.file_name}"
341341
)
342+
if document.chunks:
343+
return document
344+
342345
splitter = TextSplitter(
343346
chunk_size=self.chunk_size,
344347
chunk_overlap=self.chunk_overlap,

docreader/parser/csv_parser.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import logging
2+
from io import BytesIO
3+
from typing import List
4+
5+
import pandas as pd
6+
7+
from docreader.models.document import Chunk, Document
8+
from docreader.parser.base_parser import BaseParser
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
class CSVParser(BaseParser):
    """Parser that turns every data row of a CSV file into its own chunk.

    Each row is rendered as one line of ``column: value`` pairs joined by
    commas and terminated by a newline; the document content is the
    concatenation of those lines.
    """

    def parse_into_text(self, content: bytes) -> Document:
        """Convert raw CSV bytes into a Document with one chunk per row.

        Args:
            content: Raw bytes of the CSV file.

        Returns:
            A Document whose ``content`` is all rendered rows concatenated and
            whose ``chunks`` hold one rendered row each. ``start`` and ``end``
            of a chunk are the character offsets of that row inside
            ``content``. Content that pandas reports as empty yields a
            Document with empty content and no chunks instead of raising.
        """
        try:
            # on_bad_lines="skip": rows pandas cannot parse are dropped
            # instead of aborting the whole file.
            df = pd.read_csv(BytesIO(content), on_bad_lines="skip")
        except pd.errors.EmptyDataError:
            # pandas raises when there are no columns to parse (e.g. a
            # zero-byte upload); treat that as an empty document.
            logger.warning("CSV content is empty, returning an empty document")
            return Document(content="", chunks=[])

        # Column labels are the same for every row, so normalise them once.
        # str() guards against non-string labels, which have no .strip().
        labels = [(col, str(col).strip()) for col in df.columns]

        chunks: List[Chunk] = []
        text: List[str] = []
        start = 0

        for seq, (_, row) in enumerate(df.iterrows()):
            content_row = (
                ",".join(
                    f"{label}: {str(row[col]).strip()}" for col, label in labels
                )
                + "\n"
            )
            end = start + len(content_row)
            text.append(content_row)
            chunks.append(Chunk(content=content_row, seq=seq, start=start, end=end))
            # The next row begins where this one ended.
            start = end

        return Document(
            content="".join(text),
            chunks=chunks,
        )
37+
38+
39+
if __name__ == "__main__":
    # Manual smoke test: parse a local CSV file and log the full document
    # content followed by the content of every chunk.
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.csv"
    parser = CSVParser()
    with open(your_file, "rb") as f:
        document = parser.parse_into_text(f.read())

    logger.error(document.content)
    for chunk in document.chunks:
        logger.error(chunk.content)

docreader/parser/excel_parser.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import logging
2+
from io import BytesIO
3+
from typing import List
4+
5+
import pandas as pd
6+
7+
from docreader.models.document import Chunk, Document
8+
from docreader.parser.base_parser import BaseParser
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
class ExcelParser(BaseParser):
    """Parser that turns every non-empty row of an Excel workbook into a chunk.

    All sheets are processed in workbook order. Each row is rendered as one
    line of ``column: value`` pairs joined by commas and terminated by a
    newline; cells without a value are left out of the line.
    """

    def parse_into_text(self, content: bytes) -> Document:
        """Convert raw Excel bytes into a Document with one chunk per row.

        Args:
            content: Raw bytes of the workbook.

        Returns:
            A Document whose ``content`` is all rendered rows concatenated and
            whose ``chunks`` hold one rendered row each. ``seq`` numbers the
            chunks continuously across sheets, and ``start``/``end`` are the
            character offsets of the row inside ``content``.
        """
        chunks: List[Chunk] = []
        text: List[str] = []
        start = 0

        # Use ExcelFile as a context manager so the underlying workbook
        # reader is closed even if parsing a sheet raises.
        with pd.ExcelFile(BytesIO(content)) as excel_file:
            for excel_sheet_name in excel_file.sheet_names:
                df = excel_file.parse(sheet_name=excel_sheet_name)
                # Drop rows in which every cell is empty.
                df = df.dropna(how="all")

                for _, row in df.iterrows():
                    # Keep only the cells that actually carry a value.
                    page_content = [
                        f"{k}: {v}" for k, v in row.items() if pd.notna(v)
                    ]
                    if not page_content:
                        continue

                    content_row = ",".join(page_content) + "\n"
                    end = start + len(content_row)
                    text.append(content_row)
                    chunks.append(
                        Chunk(
                            content=content_row,
                            seq=len(chunks),
                            start=start,
                            end=end,
                        )
                    )
                    # The next row begins where this one ended.
                    start = end

        return Document(content="".join(text), chunks=chunks)
40+
41+
42+
if __name__ == "__main__":
    # Manual smoke test: parse a local workbook, then log the full document
    # content and the content of the first chunk only.
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.xlsx"
    parser = ExcelParser()
    with open(your_file, "rb") as f:
        document = parser.parse_into_text(f.read())

    logger.error(document.content)
    first_chunk = next(iter(document.chunks), None)
    if first_chunk is not None:
        logger.error(first_chunk.content)

docreader/parser/parser.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44
from docreader.models.document import Document
55
from docreader.models.read_config import ChunkingConfig
66
from docreader.parser.base_parser import BaseParser
7+
from docreader.parser.csv_parser import CSVParser
78
from docreader.parser.doc_parser import DocParser
89
from docreader.parser.docx2_parser import Docx2Parser
10+
from docreader.parser.excel_parser import ExcelParser
911
from docreader.parser.image_parser import ImageParser
1012
from docreader.parser.markdown_parser import MarkdownParser
1113
from docreader.parser.pdf_parser import PDFParser
@@ -37,6 +39,9 @@ def __init__(self):
3739
"tiff": ImageParser,
3840
"webp": ImageParser,
3941
"markdown": MarkdownParser,
42+
"csv": CSVParser,
43+
"xlsx": ExcelParser,
44+
"xls": ExcelParser,
4045
}
4146
logger.info(
4247
"Parser initialized with %d parsers: %s",

frontend/src/utils/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ export function formatStringDate(date: any) {
2323
);
2424
}
2525
export function kbFileTypeVerification(file: any) {
26-
let validTypes = ["pdf", "txt", "md", "docx", "doc", "jpg", "jpeg", "png"];
26+
let validTypes = ["pdf", "txt", "md", "docx", "doc", "jpg", "jpeg", "png", "csv", "xlsx", "xls"];
2727
let type = file.name.substring(file.name.lastIndexOf(".") + 1);
2828
if (!validTypes.includes(type)) {
2929
MessagePlugin.error("文件类型错误!");

internal/application/service/knowledge.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1353,7 +1353,7 @@ func (s *knowledgeService) UpdateKnowledge(ctx context.Context, knowledge *types
13531353
// isValidFileType checks if a file type is supported
13541354
func isValidFileType(filename string) bool {
13551355
switch strings.ToLower(getFileType(filename)) {
1356-
case "pdf", "txt", "docx", "doc", "md", "markdown", "png", "jpg", "jpeg", "gif":
1356+
case "pdf", "txt", "docx", "doc", "md", "markdown", "png", "jpg", "jpeg", "gif", "csv", "xlsx", "xls":
13571357
return true
13581358
default:
13591359
return false

0 commit comments

Comments
 (0)