diff --git a/packages/tdev/docx-grader/extractors/index.ts b/packages/tdev/docx-grader/extractors/index.ts
new file mode 100644
index 000000000..e05056083
--- /dev/null
+++ b/packages/tdev/docx-grader/extractors/index.ts
@@ -0,0 +1,3 @@
+import { default as textExtractor } from './text';
+import { default as languageExtractor } from './language';
+export { textExtractor, languageExtractor };
diff --git a/packages/tdev/docx-grader/extractors/language.ts b/packages/tdev/docx-grader/extractors/language.ts
new file mode 100644
index 000000000..d745e9c8c
--- /dev/null
+++ b/packages/tdev/docx-grader/extractors/language.ts
@@ -0,0 +1,53 @@
+import { Node, visit } from '..';
+import { extractInlineText } from './text';
+
+interface LanguagePart {
+    lang: string;
+    text: string;
+}
+
+/**
+ * Reads the `w:val` attribute of the `w:lang` child of a run-properties node.
+ *
+ * @returns the attribute value, or `undefined` when `rPr` is a string, has no single
+ *          `w:lang` object child, or that child carries no `attributes` object
+ */
+const getLang = (rPr: Node | string) => {
+    if (typeof rPr === 'string') {
+        return undefined;
+    }
+    const lang = rPr['w:lang'];
+    // `typeof null` is 'object' as well, so null is ruled out before dereferencing.
+    if (typeof lang !== 'object' || lang === null || Array.isArray(lang)) {
+        return undefined;
+    }
+    if (typeof lang.attributes !== 'object' || lang.attributes === null) {
+        return undefined;
+    }
+    return lang.attributes['w:val'];
+};
+
+/**
+ * Collects every `w:rPr` below `node` that declares a language and pairs that language
+ * with the inline text of the element owning the properties.
+ */
+const extractor = (node: Node): LanguagePart[] => {
+    const languageParts: LanguagePart[] = [];
+    visit(
+        node,
+        (name, candidate) => name === 'w:rPr' && getLang(candidate) !== undefined,
+        (_name, rPr, _index, parent) => {
+            const lang = getLang(rPr);
+            if (typeof lang === 'string') {
+                // `parent.node` is the element that contains `w:rPr`; it normally holds no
+                // `w:p`, so the paragraph-based default extractor would return '' for it.
+                const text = parent ? extractInlineText(parent.node) : '';
+                languageParts.push({ lang, text });
+            }
+            return 'continue';
+        }
+    );
+    return languageParts;
+};
+
+export default extractor;
diff --git a/packages/tdev/docx-grader/extractors/text.ts b/packages/tdev/docx-grader/extractors/text.ts
new file mode 100644
index 000000000..65bf3d29e
--- /dev/null
+++ b/packages/tdev/docx-grader/extractors/text.ts
@@ -0,0 +1,38 @@
+import { Node, visit } from '..';
+
+/** Concatenates the `#text` of every `w:t` element found below `node`. */
+export const extractInlineText = (node: Node): string => {
+    const inlineParts: string[] = [];
+    visit(
+        node,
+        (name) => name === 'w:t',
+        (_name, textNode) => {
+            const text = textNode['#text'];
+            if (text !== undefined) {
+                inlineParts.push(text);
+            }
+            // The text element itself has been handled; do not descend into it.
+            return 'skipChildren';
+        }
+    );
+    return inlineParts.join('');
+};
+
+/**
+ * Extracts the plain text of a parsed document: the inline text of every `w:p`
+ * element below `node`, one paragraph per line.
+ */
+const extractor = (node: Node): string => {
+    const paragraphs: string[] = [];
+    visit(
+        node,
+        (name) => name === 'w:p',
+        (_name, paragraph) => {
+            paragraphs.push(extractInlineText(paragraph));
+            return 'continue';
+        }
+    );
+    return paragraphs.join('\n');
+};
+
+export default extractor;
diff --git a/packages/tdev/docx-grader/index.ts b/packages/tdev/docx-grader/index.ts
new file mode 100644
index 000000000..794c54569
--- /dev/null
+++ b/packages/tdev/docx-grader/index.ts
@@ -0,0 +1,86 @@
+interface Attribute {
+    [key: string]: string | number | undefined;
+}
+
+export interface Node {
+    attributes?: Attribute;
+    ['#text']?: string;
+    parent?: Node;
+    [key: string]: Node | Node[] | string | number | Attribute | undefined;
+}
+
+/**
+ * Instruction a {@link Visitor} returns to steer the traversal:
+ * - `continue`: carry on with the children of the visited node,
+ * - `skipChildren`: carry on, but do not descend into the visited node,
+ * - `break`: stop the whole traversal,
+ * - `skip`: currently handled exactly like `continue`.
+ */
+export type Action = 'continue' | 'break' | 'skip' | 'skipChildren';
+
+export type Tester = (name: string, node: Node) => boolean;
+export type Visitor = (
+    name: string,
+    node: Node,
+    index: number,
+    parent?: { name: string; node: Node }
+) => Action;
+
+/** Keys of a node that carry metadata instead of child elements. */
+const NON_CHILD_KEYS = new Set(['attributes', '#text', 'parent']);
+
+/** Narrowing helper: true for any non-null object, i.e. anything that can be walked. */
+const isNode = (value: unknown): value is Node => typeof value === 'object' && value !== null;
+
+/**
+ * Walks the tree below `node` depth-first and calls `visit` for every node accepted by `test`.
+ *
+ * The root is offered to `test` under the name `root` with index `-1` and no parent. Children
+ * are visited in key order; members of an array receive their array index, single children `-1`.
+ *
+ * @param node  root of the tree to walk
+ * @param test  selects the nodes `visit` is called for
+ * @param visit callback whose returned {@link Action} steers the traversal
+ * @returns `'break'` if a visitor stopped the traversal, `'continue'` otherwise
+ */
+export const visit = (node: Node, test: Tester, visit: Visitor): Action => {
+    const visitNode = (
+        name: string,
+        node: Node,
+        index: number,
+        parent?: { name: string; node: Node }
+    ): Action => {
+        if (test(name, node)) {
+            const action = visit(name, node, index, parent);
+            if (action === 'break') {
+                return 'break';
+            }
+            if (action === 'skipChildren') {
+                return 'continue';
+            }
+        }
+        // Passed to the children as their parent: the current node under its own name.
+        const self = { name, node };
+        // A plain loop instead of `forEach`, so that `return 'break'` leaves `visitNode`
+        // and the abort propagates all the way up to the root call.
+        for (const childName of Object.keys(node)) {
+            if (NON_CHILD_KEYS.has(childName)) {
+                continue;
+            }
+            const child = node[childName];
+            if (Array.isArray(child)) {
+                for (let i = 0; i < child.length; i++) {
+                    const item = child[i];
+                    if (isNode(item) && visitNode(childName, item, i, self) === 'break') {
+                        return 'break';
+                    }
+                }
+            } else if (isNode(child) && visitNode(childName, child, -1, self) === 'break') {
+                return 'break';
+            }
+        }
+        return 'continue';
+    };
+
+    return visitNode('root', node, -1);
+};
diff --git a/packages/tdev/docx-grader/package.json b/packages/tdev/docx-grader/package.json
new file mode 100644
index 000000000..ddb32dbb8
--- /dev/null
+++ b/packages/tdev/docx-grader/package.json
@@ -0,0 +1,18 @@
+{
+    "name": "@tdev/docx-grader",
+    "version": "1.0.0",
+    "main": "index.ts",
+    "types": "index.ts",
+    "dependencies": {
+        "fast-xml-parser": "^5.2.5",
+        "fflate": "^0.8.2"
+    },
+    "devDependencies": {
+        "vitest": "*",
+        "@docusaurus/module-type-aliases": "*",
+        "@docusaurus/core": "*"
+    },
+    "peerDependencies": {
+        "@tdev/core": "1.0.0"
+    }
+}
diff --git a/packages/tdev/docx-grader/tsconfig.json b/packages/tdev/docx-grader/tsconfig.json
new file mode 100644
index 000000000..ea56794f8
--- /dev/null
+++ b/packages/tdev/docx-grader/tsconfig.json
@@ -0,0 +1,3 @@
+{
+    "extends": "../../../tsconfig.json"
+}
diff --git a/src/pages/docx/index.tsx b/src/pages/docx/index.tsx
new file mode 100644
index 000000000..c765669bf
--- /dev/null
+++ b/src/pages/docx/index.tsx
@@ -0,0 +1,155 @@
+import React from 'react';
+import Layout from '@theme/Layout';
+import { unzip } from 'fflate';
+import { XMLParser } from 'fast-xml-parser';
+import CodeBlock from '@theme/CodeBlock';
+import { visit } from '@tdev/docx-grader';
+import { languageExtractor, textExtractor } from
'@tdev/docx-grader/extractors';
+
+/**
+ * Reads `file` into memory and unpacks it as a ZIP archive.
+ *
+ * @returns the archive entries keyed by their path inside the archive
+ */
+const unzipFile = (file: File): Promise<{ [path: string]: Uint8Array }> => {
+    return new Promise((resolve, reject) => {
+        const reader = new FileReader();
+        reader.onload = () => {
+            const buffer = new Uint8Array(reader.result as ArrayBuffer);
+            unzip(buffer, (err, unzipped) => {
+                if (err) reject(err);
+                else resolve(unzipped);
+            });
+        };
+        // Without this a failed read would leave the promise pending forever.
+        reader.onerror = () => reject(reader.error);
+        reader.readAsArrayBuffer(file);
+    });
+};
+
+/**
+ * Parses every `.xml` / `.rels` entry of an unpacked archive with fast-xml-parser.
+ *
+ * Entries that fail to parse are logged and left out; all other entries are passed
+ * through unchanged.
+ */
+const parseXmlFiles = (unzipped: { [path: string]: Uint8Array }) => {
+    // Attributes are grouped under `attributes` and text under `#text`.
+    const parser = new XMLParser({
+        ignoreAttributes: false,
+        attributeNamePrefix: '',
+        attributesGroupName: 'attributes',
+        textNodeName: '#text',
+        alwaysCreateTextNode: true,
+        parseAttributeValue: true,
+        parseTagValue: true,
+        trimValues: false,
+        allowBooleanAttributes: true
+    });
+
+    const xmlFiles: { [path: string]: any } = {};
+
+    Object.entries(unzipped).forEach(([path, data]) => {
+        if (path.endsWith('.xml') || path.endsWith('.rels')) {
+            try {
+                const xmlString = new TextDecoder('utf-8').decode(data);
+                xmlFiles[path] = parser.parse(xmlString);
+            } catch (err) {
+                console.warn(`Failed to parse XML file ${path}:`, err);
+            }
+        } else {
+            xmlFiles[path] = data; // Keep non-XML files as Uint8Array
+        }
+    });
+
+    return xmlFiles;
+};
+
+export default function Home(): React.ReactNode {
+    const [parsedFiles, setParsedFiles] = React.useState<{ [path: string]: any } | null>(null);
+    const [loading, setLoading] = React.useState(false);
+    const [error, setError] = React.useState<string | null>(null);
+    const [shownFile, setShownFiles] = React.useState('');
+    const [plainText, setPlainText] = React.useState('');
+
+    const handleFileUpload = async (event: React.ChangeEvent<HTMLInputElement>) => {
+        const file = event.target.files?.[0];
+        if (!file) return;
+
+        if (!file.name.endsWith('.docx') && !file.name.endsWith('.odt')) {
+            setError('Please select a .docx or .odt file');
+            return;
+        }
+
+        setLoading(true);
+        setError(null);
+
+        try {
+            const unzipped = await unzipFile(file);
+            const parsed = parseXmlFiles(unzipped);
+            setParsedFiles(parsed);
+            setShownFiles('word/document.xml');
+        } catch (err) {
+            const message = err instanceof Error ? err.message : String(err);
+            setError('Failed to process file: ' + message);
+        } finally {
+            setLoading(false);
+        }
+    };
+
+    React.useEffect(() => {
+        const parsed = parsedFiles?.[shownFile];
+        if (!parsed) {
+            setPlainText('');
+            return;
+        }
+        setPlainText(textExtractor(parsed));
+        console.log(languageExtractor(parsed));
+        // Also re-run when a new upload replaces `parsedFiles` while `shownFile` stays the same.
+    }, [parsedFiles, shownFile]);
+
+    return (
+
+
+

DOCX

+
+

Word Document Reader

+ +
+ +
+ + {loading &&

Loading document...

} + + {error &&
{error}
} + + {parsedFiles && ( +
+

Document Parsed Successfully

+

Found {Object.keys(parsedFiles).length} files

+
+ File Structure + {Object.keys(parsedFiles) + .filter((f) => f.endsWith('xml')) + .map((filePath) => ( +
+ +
+ ))} +
+ + {plainText} + + {parsedFiles[shownFile] && ( + + {JSON.stringify(parsedFiles[shownFile] || {}, null, 2)} + + )} +
+ )} +
+
+
+ ); +} diff --git a/yarn.lock b/yarn.lock index 33ebb9166..4e18589a8 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8444,6 +8444,13 @@ fast-uri@^3.0.1: resolved "https://registry.yarnpkg.com/fast-uri/-/fast-uri-3.0.6.tgz#88f130b77cfaea2378d56bf970dea21257a68748" integrity sha512-Atfo14OibSv5wAp4VWNsFYE1AchQRTv9cBGWET4pZWHzYshFSS9NQI6I57rdKn9croWVMbYFbLhJ+yJvmZIIHw== +fast-xml-parser@^5.2.5: + version "5.2.5" + resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-5.2.5.tgz#4809fdfb1310494e341098c25cb1341a01a9144a" + integrity sha512-pfX9uG9Ki0yekDHx2SiuRIyFdyAr1kMIMitPvb0YBo8SUfKvia7w7FIyd/l6av85pFYRhZscS75MwMnbvY+hcQ== + dependencies: + strnum "^2.1.0" + fastq@^1.6.0: version "1.19.1" resolved "https://registry.yarnpkg.com/fastq/-/fastq-1.19.1.tgz#d50eaba803c8846a883c16492821ebcd2cda55f5" @@ -8477,6 +8484,11 @@ feed@^4.2.2: dependencies: xml-js "^1.6.11" +fflate@^0.8.2: + version "0.8.2" + resolved "https://registry.yarnpkg.com/fflate/-/fflate-0.8.2.tgz#fc8631f5347812ad6028bbe4a2308b2792aa1dea" + integrity sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A== + figures@^3.2.0: version "3.2.0" resolved "https://registry.yarnpkg.com/figures/-/figures-3.2.0.tgz#625c18bd293c604dc4a8ddb2febf0c88341746af" @@ -13943,6 +13955,11 @@ strip-literal@^3.0.0: dependencies: js-tokens "^9.0.1" +strnum@^2.1.0: + version "2.1.1" + resolved "https://registry.yarnpkg.com/strnum/-/strnum-2.1.1.tgz#cf2a6e0cf903728b8b2c4b971b7e36b4e82d46ab" + integrity sha512-7ZvoFTiCnGxBtDqJ//Cu6fWtZtc7Y3x+QOirG15wztbdngGSkht27o2pyGWrVy0b4WAy3jbKmnoK6g5VlVNUUw== + style-mod@^4.0.0, style-mod@^4.1.0: version "4.1.2" resolved "https://registry.yarnpkg.com/style-mod/-/style-mod-4.1.2.tgz#ca238a1ad4786520f7515a8539d5a63691d7bf67"