From 141c8c3eb725a6531eca8ebf8858a8c30e26c859 Mon Sep 17 00:00:00 2001 From: bh0fer Date: Wed, 25 Jun 2025 11:55:30 +0200 Subject: [PATCH 1/4] feature: docx grader POC of the reading, parsing and evaluating word docs --- package.json | 2 + src/pages/docx/index.tsx | 173 ++++++++++++++++++++++++++++++++++++++ src/pages/docx/visitor.ts | 58 +++++++++++++ yarn.lock | 17 ++++ 4 files changed, 250 insertions(+) create mode 100644 src/pages/docx/index.tsx create mode 100644 src/pages/docx/visitor.ts diff --git a/package.json b/package.json index 641062fb8..fbfa17700 100644 --- a/package.json +++ b/package.json @@ -53,6 +53,8 @@ "docusaurus-plugin-sass": "^0.2.6", "dotenv": "^16.5.0", "exceljs": "^4.4.0", + "fast-xml-parser": "^5.2.5", + "fflate": "^0.8.2", "gray-matter": "^4.0.3", "heic2any": "^0.0.4", "js-yaml": "^4.1.0", diff --git a/src/pages/docx/index.tsx b/src/pages/docx/index.tsx new file mode 100644 index 000000000..fec585991 --- /dev/null +++ b/src/pages/docx/index.tsx @@ -0,0 +1,173 @@ +import React from 'react'; +import Layout from '@theme/Layout'; +import { unzip } from 'fflate'; +import { XMLParser } from 'fast-xml-parser'; +import CodeBlock from '@theme/CodeBlock'; +import { visit } from './visitor'; + +const unzipFile = (file: File): Promise<{ [path: string]: Uint8Array }> => { + return new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onload = () => { + const buffer = new Uint8Array(reader.result as ArrayBuffer); + unzip(buffer, (err, unzipped) => { + if (err) reject(err); + else resolve(unzipped); + }); + }; + reader.readAsArrayBuffer(file); + }); +}; + +const parseXmlFiles = (unzipped: { [path: string]: Uint8Array }) => { + const parser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: '', + attributesGroupName: 'attributes', + textNodeName: '#text', + alwaysCreateTextNode: true, + parseAttributeValue: true, + parseTagValue: true, + trimValues: false, + allowBooleanAttributes: true + }); + + const xmlFiles: { [path: string]: any } = {}; + + Object.entries(unzipped).forEach(([path, data]) => { + if (path.endsWith('.xml') || path.endsWith('.rels')) { + try { + const xmlString = new TextDecoder('utf-8').decode(data); + xmlFiles[path] = parser.parse(xmlString); + } catch (err) { + console.warn(`Failed to parse XML file ${path}:`, err); + } + } else { + xmlFiles[path] = data; // Keep non-XML files as Uint8Array + } + }); + + return xmlFiles; +}; + +export default function Home(): React.ReactNode { + const [parsedFiles, setParsedFiles] = React.useState<{ [path: string]: any } | null>(null); + const [loading, setLoading] = React.useState(false); + const [error, setError] = React.useState(null); + const [shownFile, setShownFiles] = React.useState(''); + const [plainText, setPlainText] = React.useState(''); + + const handleFileUpload = async (event: React.ChangeEvent) => { + const file = event.target.files?.[0]; + if (!file) return; + + if (!file.name.endsWith('.docx') && !file.name.endsWith('.odt')) { + setError('Please select a .docx file'); + return; + } + + setLoading(true); + setError(null); + + try { + const unzipped = await unzipFile(file); + const parsed = parseXmlFiles(unzipped); + setParsedFiles(parsed); + setShownFiles('word/document.xml'); + } catch (err) { + setError('Failed to process file: ' + (err as Error).message); + } finally { + setLoading(false); + } + }; + + React.useEffect(() => { + const parsed = parsedFiles?.[shownFile]; + if (!parsed) { + setPlainText(''); + return; + } + const parts: string[] = []; + visit( + parsed, + (n) => n === 'w:p', + (_, node) => { + const inlineParts: string[] = []; + visit( + node, + (n) => n === 'w:t', + (_, node) => { + if (node['#text'] !== undefined) { + inlineParts.push(node['#text']); + } + return 'skipChildren'; + } + ); + parts.push(inlineParts.join('')); + return 'continue'; + } + ); + setPlainText(parts.join('\n')); + }, [shownFile]); + + return ( + +
+

DOCX

+
+

Word Document Reader

+ +
+ +
+ + {loading &&

Loading document...

} + + {error &&
{error}
} + + {parsedFiles && ( +
+

Document Parsed Successfully

+

Found {Object.keys(parsedFiles).length} files

+
+ File Structure + {Object.keys(parsedFiles) + .filter((f) => f.endsWith('xml')) + .map((filePath) => ( +
+ +
+ ))} +
+ + {plainText} + + {parsedFiles[shownFile] && ( + + {JSON.stringify(parsedFiles[shownFile] || {}, null, 2)} + + )} +
+ )} +
+
+
+ ); +} diff --git a/src/pages/docx/visitor.ts b/src/pages/docx/visitor.ts new file mode 100644 index 000000000..74deb85d9 --- /dev/null +++ b/src/pages/docx/visitor.ts @@ -0,0 +1,58 @@ +interface Attribute { + [key: string]: string | number | undefined; +} + +export interface Node { + attributes?: Attribute; + ['#text']?: string; + [key: string]: Node | Node[] | string | number | Attribute | undefined; +} +export type Action = 'continue' | 'break' | 'skip' | 'skipChildren'; +export const visit = ( + node: Node, + test: (name: string, node: Node) => boolean, + visit: (name: string, node: Node, index: number, parent?: { name: string; node: Node }) => Action +) => { + const visitNode = ( + name: string, + node: Node, + index: number, + parent?: { name: string; node: Node } + ): Action => { + if (test(name, node)) { + const action = visit(name, node, index, parent); + if (action === 'break') { + return 'break'; + } + if (action === 'skipChildren') { + return 'continue'; + } + } + const childNames = Object.keys(node).filter((prop) => prop !== 'attributes' && prop !== '#text'); + childNames.forEach((name, idx) => { + const child = node[name]; + if (Array.isArray(child)) { + for (let i = 0; i < child.length; i++) { + const action = visitNode(name, child[i], i, { name, node }); + if (action === 'break') { + return 'break'; + } + if (action === 'skip') { + continue; + } + } + } else if (typeof child === 'object' && child !== null) { + const action = visitNode(name, child, -1, { name, node }); + if (action === 'break') { + return 'break'; + } + if (action === 'skip') { + return 'continue'; + } + } + }); + return 'continue'; + }; + + return visitNode('root', node, -1); +}; diff --git a/yarn.lock b/yarn.lock index 33ebb9166..4e18589a8 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8444,6 +8444,13 @@ fast-uri@^3.0.1: resolved "https://registry.yarnpkg.com/fast-uri/-/fast-uri-3.0.6.tgz#88f130b77cfaea2378d56bf970dea21257a68748" integrity sha512-Atfo14OibSv5wAp4VWNsFYE1AchQRTv9cBGWET4pZWHzYshFSS9NQI6I57rdKn9croWVMbYFbLhJ+yJvmZIIHw== +fast-xml-parser@^5.2.5: + version "5.2.5" + resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-5.2.5.tgz#4809fdfb1310494e341098c25cb1341a01a9144a" + integrity sha512-pfX9uG9Ki0yekDHx2SiuRIyFdyAr1kMIMitPvb0YBo8SUfKvia7w7FIyd/l6av85pFYRhZscS75MwMnbvY+hcQ== + dependencies: + strnum "^2.1.0" + fastq@^1.6.0: version "1.19.1" resolved "https://registry.yarnpkg.com/fastq/-/fastq-1.19.1.tgz#d50eaba803c8846a883c16492821ebcd2cda55f5" @@ -8477,6 +8484,11 @@ feed@^4.2.2: dependencies: xml-js "^1.6.11" +fflate@^0.8.2: + version "0.8.2" + resolved "https://registry.yarnpkg.com/fflate/-/fflate-0.8.2.tgz#fc8631f5347812ad6028bbe4a2308b2792aa1dea" + integrity sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A== + figures@^3.2.0: version "3.2.0" resolved "https://registry.yarnpkg.com/figures/-/figures-3.2.0.tgz#625c18bd293c604dc4a8ddb2febf0c88341746af" @@ -13943,6 +13955,11 @@ strip-literal@^3.0.0: dependencies: js-tokens "^9.0.1" +strnum@^2.1.0: + version "2.1.1" + resolved "https://registry.yarnpkg.com/strnum/-/strnum-2.1.1.tgz#cf2a6e0cf903728b8b2c4b971b7e36b4e82d46ab" + integrity sha512-7ZvoFTiCnGxBtDqJ//Cu6fWtZtc7Y3x+QOirG15wztbdngGSkht27o2pyGWrVy0b4WAy3jbKmnoK6g5VlVNUUw== + style-mod@^4.0.0, style-mod@^4.1.0: version "4.1.2" resolved "https://registry.yarnpkg.com/style-mod/-/style-mod-4.1.2.tgz#ca238a1ad4786520f7515a8539d5a63691d7bf67" From 2c339b84ca3762638472ee8fbdedb3a8fa1b774e Mon Sep 17 00:00:00 2001 From: bh0fer Date: Wed, 25 Jun 2025 12:21:16 +0200 Subject: [PATCH 2/4] factor deps to its own package --- package.json | 2 -- .../tdev/docx-grader/index.ts | 0 packages/tdev/docx-grader/package.json | 18 ++++++++++++++++++ packages/tdev/docx-grader/tsconfig.json | 3 +++ src/pages/docx/index.tsx | 2 +- 5 files changed, 22 insertions(+), 3 deletions(-) rename src/pages/docx/visitor.ts => packages/tdev/docx-grader/index.ts (100%) create mode 100644 packages/tdev/docx-grader/package.json create mode 100644 packages/tdev/docx-grader/tsconfig.json diff --git a/package.json b/package.json index fbfa17700..641062fb8 100644 --- a/package.json +++ b/package.json @@ -53,8 +53,6 @@ "docusaurus-plugin-sass": "^0.2.6", "dotenv": "^16.5.0", "exceljs": "^4.4.0", - "fast-xml-parser": "^5.2.5", - "fflate": "^0.8.2", "gray-matter": "^4.0.3", "heic2any": "^0.0.4", "js-yaml": "^4.1.0", diff --git a/src/pages/docx/visitor.ts b/packages/tdev/docx-grader/index.ts similarity index 100% rename from src/pages/docx/visitor.ts rename to packages/tdev/docx-grader/index.ts diff --git a/packages/tdev/docx-grader/package.json b/packages/tdev/docx-grader/package.json new file mode 100644 index 000000000..2ae6028d2 --- /dev/null +++ b/packages/tdev/docx-grader/package.json @@ -0,0 +1,18 @@ +{ + "name": "@tdev/docx-grader", + "version": "1.0.0", + "main": "index.ts", + "types": "index.ts", + "dependencies": { + "fast-xml-parser": "^5.2.5", + "fflate": "^0.8.2" + }, + "devDependencies": { + "vitest": "*", + "@docusaurus/module-type-aliases": "*", + "@docusaurus/core": "*" + }, + "peerDependencies": { + "@tdev/core": "1.0.0" + } +} \ No newline at end of file diff --git a/packages/tdev/docx-grader/tsconfig.json b/packages/tdev/docx-grader/tsconfig.json new file mode 100644 index 000000000..ea56794f8 --- /dev/null +++ b/packages/tdev/docx-grader/tsconfig.json @@ -0,0 +1,3 @@ +{ + "extends": "../../../tsconfig.json" +} diff --git a/src/pages/docx/index.tsx b/src/pages/docx/index.tsx index fec585991..d8500ce9e 100644 --- a/src/pages/docx/index.tsx +++ b/src/pages/docx/index.tsx @@ -3,7 +3,7 @@ import Layout from '@theme/Layout'; import { unzip } from 'fflate'; import { XMLParser } from 'fast-xml-parser'; import CodeBlock from '@theme/CodeBlock'; -import { visit } from './visitor'; +import { visit } from '@tdev/docx-grader'; const unzipFile = (file: File): Promise<{ [path: string]: Uint8Array }> => { return new Promise((resolve, reject) => { From fa03efaa2dccb891a8c40dd3cf2133ee308be409 Mon Sep 17 00:00:00 2001 From: bh0fer Date: Wed, 25 Jun 2025 12:24:20 +0200 Subject: [PATCH 3/4] fix format --- packages/tdev/docx-grader/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tdev/docx-grader/package.json b/packages/tdev/docx-grader/package.json index 2ae6028d2..ddb32dbb8 100644 --- a/packages/tdev/docx-grader/package.json +++ b/packages/tdev/docx-grader/package.json @@ -15,4 +15,4 @@ "peerDependencies": { "@tdev/core": "1.0.0" } -} \ No newline at end of file +} From 455d3bd73cbd4fb8d43a5a9a4b3b80165224b007 Mon Sep 17 00:00:00 2001 From: bh0fer Date: Wed, 25 Jun 2025 15:51:45 +0200 Subject: [PATCH 4/4] add some poc extractors --- packages/tdev/docx-grader/extractors/index.ts | 3 ++ .../tdev/docx-grader/extractors/language.ts | 38 +++++++++++++++++++ packages/tdev/docx-grader/extractors/text.ts | 27 +++++++++++++ packages/tdev/docx-grader/index.ts | 18 ++++++--- src/pages/docx/index.tsx | 24 ++---------- 5 files changed, 84 insertions(+), 26 deletions(-) create mode 100644 packages/tdev/docx-grader/extractors/index.ts create mode 100644 packages/tdev/docx-grader/extractors/language.ts create mode 100644 packages/tdev/docx-grader/extractors/text.ts diff --git a/packages/tdev/docx-grader/extractors/index.ts b/packages/tdev/docx-grader/extractors/index.ts new file mode 100644 index 000000000..e05056083 --- /dev/null +++ b/packages/tdev/docx-grader/extractors/index.ts @@ -0,0 +1,3 @@ +import { default as textExtractor } from './text'; +import { default as languageExtractor } from './language'; +export { textExtractor, languageExtractor }; diff --git a/packages/tdev/docx-grader/extractors/language.ts b/packages/tdev/docx-grader/extractors/language.ts new file mode 100644 index 000000000..d745e9c8c --- /dev/null +++ b/packages/tdev/docx-grader/extractors/language.ts @@ -0,0 +1,38 @@ +import { Node, visit, Visitor } from '..'; +import { default as extractText } from './text'; + +interface LanguagePart { + lang: string; + text: string; +} + +const getLang = (rPr: Node | string) => { + if (typeof rPr === 'string' || typeof rPr['w:lang'] !== 'object') { + return undefined; + } + if (Array.isArray(rPr['w:lang']) || typeof rPr['w:lang'].attributes !== 'object') { + return undefined; + } + return rPr['w:lang'].attributes['w:val']; +}; + +const extractor = (node: Node) => { + const languageParts: LanguagePart[] = []; + visit( + node, + (name, node) => { + return name === 'w:rPr' && getLang(node) !== undefined; + }, + (name, rPr, index, parent) => { + const lang = getLang(rPr); + if (typeof lang === 'string') { + const text = parent ? extractText(parent.node) : ''; + languageParts.push({ lang, text }); + } + return 'continue'; + } + ); + return languageParts; +}; + +export default extractor; diff --git a/packages/tdev/docx-grader/extractors/text.ts b/packages/tdev/docx-grader/extractors/text.ts new file mode 100644 index 000000000..65bf3d29e --- /dev/null +++ b/packages/tdev/docx-grader/extractors/text.ts @@ -0,0 +1,27 @@ +import { Node, visit } from '..'; + +const extractor = (node: Node) => { + const parts: string[] = []; + visit( + node, + (n) => n === 'w:p', + (_, node) => { + const inlineParts: string[] = []; + visit( + node, + (n) => n === 'w:t', + (_, node) => { + if (node['#text'] !== undefined) { + inlineParts.push(node['#text']); + } + return 'skipChildren'; + } + ); + parts.push(inlineParts.join('')); + return 'continue'; + } + ); + return parts.join('\n'); +}; + +export default extractor; diff --git a/packages/tdev/docx-grader/index.ts b/packages/tdev/docx-grader/index.ts index 74deb85d9..794c54569 100644 --- a/packages/tdev/docx-grader/index.ts +++ b/packages/tdev/docx-grader/index.ts @@ -5,14 +5,20 @@ interface Attribute { export interface Node { attributes?: Attribute; ['#text']?: string; + parent?: Node; [key: string]: Node | Node[] | string | number | Attribute | undefined; } export type Action = 'continue' | 'break' | 'skip' | 'skipChildren'; -export const visit = ( + +export type Tester = (name: string, node: Node) => boolean; +export type Visitor = ( + name: string, node: Node, - test: (name: string, node: Node) => boolean, - visit: (name: string, node: Node, index: number, parent?: { name: string; node: Node }) => Action -) => { + index: number, + parent?: { name: string; node: Node } +) => Action; + +export const visit = (node: Node, test: Tester, visit: Visitor) => { const visitNode = ( name: string, node: Node, @@ -28,7 +34,9 @@ export const visit = ( return 'continue'; } } - const childNames = Object.keys(node).filter((prop) => prop !== 'attributes' && prop !== '#text'); + const childNames = Object.keys(node).filter( + (prop) => prop !== 'attributes' && prop !== '#text' && prop !== 'parent' + ); childNames.forEach((name, idx) => { const child = node[name]; if (Array.isArray(child)) { diff --git a/src/pages/docx/index.tsx b/src/pages/docx/index.tsx index d8500ce9e..c765669bf 100644 --- a/src/pages/docx/index.tsx +++ b/src/pages/docx/index.tsx @@ -4,6 +4,7 @@ import { unzip } from 'fflate'; import { XMLParser } from 'fast-xml-parser'; import CodeBlock from '@theme/CodeBlock'; import { visit } from '@tdev/docx-grader'; +import { languageExtractor, textExtractor } from '@tdev/docx-grader/extractors'; const unzipFile = (file: File): Promise<{ [path: string]: Uint8Array }> => { return new Promise((resolve, reject) => { @@ -87,27 +88,8 @@ export default function Home(): React.ReactNode { setPlainText(''); return; } - const parts: string[] = []; - visit( - parsed, - (n) => n === 'w:p', - (_, node) => { - const inlineParts: string[] = []; - visit( - node, - (n) => n === 'w:t', - (_, node) => { - if (node['#text'] !== undefined) { - inlineParts.push(node['#text']); - } - return 'skipChildren'; - } - ); - parts.push(inlineParts.join('')); - return 'continue'; - } - ); - setPlainText(parts.join('\n')); + setPlainText(textExtractor(parsed)); + console.log(languageExtractor(parsed)); }, [shownFile]); return (