Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions app/lib/extract.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { Collection } from "@arsonar/client/index.js";
import { PDFExtract, PDFExtractOptions } from 'pdf.js-extract'
import { Readable } from 'node:stream'
import { file as createTempFile } from 'tmp-promise'
import fs from 'fs/promises'

/**
 * Outcome of a text-extraction attempt.
 *
 * `fullText` is optional: `extractTextFromFile` starts from an empty result
 * and only assigns `fullText` when PDF extraction completes without throwing.
 */
export type ExtractResult = {
/** Concatenated document title (if any) and page text of the extracted file. */
fullText?: string
}

/**
 * Reads the `dc:title` entry from a PDF extraction result's `meta` object.
 *
 * Accepts `unknown` and narrows at runtime, so no type suppression is needed
 * and a non-string title is never stringified into the output.
 *
 * @param meta - The `meta` value of the extraction result (may be absent).
 * @returns The title when it is a string, otherwise an empty string.
 */
function readDocumentTitle(meta: unknown): string {
  if (typeof meta !== 'object' || meta === null) return ''
  const metadata = (meta as { metadata?: unknown }).metadata
  if (typeof metadata !== 'object' || metadata === null) return ''
  const title = (metadata as Record<string, unknown>)['dc:title']
  return typeof title === 'string' ? title : ''
}

/**
 * Joins the text fragments of all pages in reading order.
 *
 * Every fragment is prefixed with a single space, so a non-empty result
 * always starts with a space.
 *
 * @param pages - Pages of the extraction result, each with its text fragments.
 * @returns The concatenated fragment text, or an empty string for no fragments.
 */
function joinPageText(
  pages: ReadonlyArray<{ content: ReadonlyArray<{ str: string }> }>
): string {
  const parts: string[] = []
  for (const page of pages) {
    for (const area of page.content) {
      parts.push(' ' + area.str)
    }
  }
  return parts.join('')
}

/**
 * Extracts the text of a PDF stored in a collection.
 *
 * The file is read from the collection, written to a temporary file and
 * passed to `PDFExtract`. Extraction is best-effort: if it throws, the error
 * is logged and an empty result is returned. Failures while reading the file
 * or writing the temporary copy still reject the returned promise.
 *
 * @param collection - Collection whose `files.readFile` provides the file contents.
 * @param fileId - Identifier of the file to extract text from.
 * @returns A result whose `fullText` is set only when extraction succeeded.
 */
export async function extractTextFromFile(collection: Collection, fileId: string): Promise<ExtractResult> {
  const fileStream = await collection.files.readFile(fileId)
  const { path, cleanup } = await createTempFile()
  const result: ExtractResult = {}
  try {
    // The write sits inside try/finally so the temp file is removed even when it fails.
    await fs.writeFile(path, Readable.from(fileStream))
    try {
      const pdfExtract = new PDFExtract()
      const data = await pdfExtract.extract(path)
      result.fullText = readDocumentTitle(data.meta) + joinPageText(data.pages)
    } catch (err) {
      // Best-effort: a failed extraction must not fail the caller.
      console.error('Text extraction failed', err)
    }
  } finally {
    await cleanup()
  }
  return result
}
3 changes: 3 additions & 0 deletions app/schema.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ export const schema: Spec = {
file: {
type: 'string',
},
fullText: {
type: 'string'
}
},
},
},
Expand Down
6 changes: 5 additions & 1 deletion app/sonar.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { Workspace } from '@arsonar/client'
import type { Collection } from '@arsonar/client'
import Dotenv from 'dotenv'
import { schema } from './schema'
import { extractTextFromFile } from './lib/extract.js'

Dotenv.config()

Expand Down Expand Up @@ -43,8 +44,11 @@ export async function openCollection(): Promise<Collection> {
return collection
}

export async function createBookRecord(data: typeof schema.types.Book.fields) {
export async function createBookRecord(data: any) {
const collection = await openCollection()
const fileId = data.file
const fullText = await extractTextFromFile(collection, fileId)
if (fullText.fullText) data.fullText = fullText.fullText
const record = await collection.put({
type: 'Book',
value: data,
Expand Down
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@
"autoprefixer": "^10.4.7",
"dotenv": "^16.0.1",
"npm-run-all": "^4.1.5",
"pdf.js-extract": "^0.2.0",
"postcss": "^8.4.14",
"react": "^17.0.2",
"react-dom": "^17.0.2",
"react-icons": "^4.4.0",
"simple-isbn": "^1.1.5",
"tailwindcss": "^3.1.4"
"tailwindcss": "^3.1.4",
"tmp-promise": "^3.0.3"
},
"devDependencies": {
"@remix-run/dev": "^1.5.1",
Expand Down
20 changes: 19 additions & 1 deletion yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3458,6 +3458,11 @@ dom-accessibility-api@^0.5.9:
resolved "https://registry.yarnpkg.com/dom-accessibility-api/-/dom-accessibility-api-0.5.14.tgz#56082f71b1dc7aac69d83c4285eef39c15d93f56"
integrity sha512-NMt+m9zFMPZe0JcY9gN224Qvk6qLIdqex29clBvc/y75ZBX9YA9wNK3frsYvu2DI1xcCIwxwnX+TlsJ2DSOADg==

dommatrix@0.0.24:
version "0.0.24"
resolved "https://registry.yarnpkg.com/dommatrix/-/dommatrix-0.0.24.tgz#0b793da372992878b74c31f64ac85e7a13cb382a"
integrity sha512-PatEhAW5pIHr28MvFQGV5iiHNloqvecQZlxs7/8s/eulLqZI3uVqPkrO7YDuqsebovr/9mmcWDSWzVG4amEZgQ==

dotenv@^16.0.0, dotenv@^16.0.1:
version "16.0.1"
resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-16.0.1.tgz#8f8f9d94876c35dac989876a5d3a82a267fdce1d"
Expand Down Expand Up @@ -7711,6 +7716,14 @@ path-type@^4.0.0:
resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b"
integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==

pdf.js-extract@^0.2.0:
version "0.2.0"
resolved "https://registry.yarnpkg.com/pdf.js-extract/-/pdf.js-extract-0.2.0.tgz#162f1aefae6b1fe2aa683057fcd9fd80d44ff58c"
integrity sha512-oUcah0BEsiAtO25SrhG3jzihmiYw9bq+oH0T5NcomKu8Y2pU5j1xco2JPWV2i2vl38nf8cL/MiBu+Fcog6I07A==
dependencies:
dommatrix "0.0.24"
web-streams-polyfill "3.2.0"

peek-stream@^1.1.0:
version "1.1.3"
resolved "https://registry.yarnpkg.com/peek-stream/-/peek-stream-1.1.3.tgz#3b35d84b7ccbbd262fff31dc10da56856ead6d67"
Expand Down Expand Up @@ -9755,7 +9768,7 @@ timeout-refresh@^1.0.0, timeout-refresh@^1.0.2, timeout-refresh@^1.0.3:
resolved "https://registry.yarnpkg.com/timeout-refresh/-/timeout-refresh-1.0.3.tgz#7024a8ce0a09a57acc2ea86002048e6c0bff7375"
integrity sha512-Mz0CX4vBGM5lj8ttbIFt7o4ZMxk/9rgudJRh76EvB7xXZMur7T/cjRiH2w4Fmkq0zxf2QpM8IFvOSRn8FEu3gA==

tmp-promise@^3.0.2:
tmp-promise@^3.0.2, tmp-promise@^3.0.3:
version "3.0.3"
resolved "https://registry.yarnpkg.com/tmp-promise/-/tmp-promise-3.0.3.tgz#60a1a1cc98c988674fcbfd23b6e3367bdeac4ce7"
integrity sha512-RwM7MoPojPxsOBYnyd2hy0bxtIlVrihNs9pj5SUvY8Zz1sQcQG2tG1hSr8PDxfgEB8RNKDhqbIlroIarSNDNsQ==
Expand Down Expand Up @@ -10307,6 +10320,11 @@ web-encoding@1.1.5:
optionalDependencies:
"@zxing/text-encoding" "0.9.0"

web-streams-polyfill@3.2.0:
version "3.2.0"
resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-3.2.0.tgz#a6b74026b38e4885869fb5c589e90b95ccfc7965"
integrity sha512-EqPmREeOzttaLRm5HS7io98goBgZ7IVz79aDvqjD0kYXLtFZTc0T/U6wHTPKyIjb+MdN7DFIIX6hgdBEpWmfPA==

web-streams-polyfill@^3.1.1:
version "3.2.1"
resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-3.2.1.tgz#71c2718c52b45fd49dbeee88634b3a60ceab42a6"
Expand Down