Commit 865cb6b

can run in browser
1 parent 92ccbd0 commit 865cb6b

File tree

9 files changed: +107 -14 lines changed

- README.md
- package.json
- src/clean_30k.ts
- src/index.ts
- src/llama_3_1_tokeniser_model.ts
- src/sentencePieceProcessor.ts
- src/smart.ts
- src/test.js
- test/convertmodels.js

README.md

Lines changed: 24 additions & 1 deletion
````diff
@@ -18,7 +18,7 @@ yarn build
 
 ## Use
 
-To use this tool
+To use this tool in Node.js, you can use the following code:
 
 ```js
 
@@ -44,6 +44,29 @@ main()
 
 ```
 
+In the browser, you can use the following code:
+
+```js
+import { SentencePieceProcessor, cleanText, llama_3_1_tokeniser_b64 } from "@sctg/sentencepiece-js";
+// built-in models: llama_3_1_tokeniser_b64, clean_30k_b64, smart_b64
+async function main() {
+
+    let text = "I am still waiting on my card?"
+    let cleaned = cleanText(text)
+
+    let spp = new SentencePieceProcessor()
+    await spp.loadFromB64StringModel(llama_3_1_tokeniser_b64);
+    let ids = spp.encodeIds(cleaned)
+    console.log(ids)
+    let str = spp.decodeIds(ids) // list of ids -> number
+    console.log(str)
+
+    let pieces = spp.encodePieces(cleaned) // list of tokens -> string
+    console.log(pieces)
+}
+main()
+```
+
 ## Note
 
 - devilyouwei updated this repo to make this module support the js `require` keyword and added the usage example.
````

package.json

Lines changed: 4 additions & 4 deletions
```diff
@@ -1,6 +1,6 @@
 {
   "name": "@sctg/sentencepiece-js",
-  "version": "1.2.0",
+  "version": "1.3.0",
   "description": "Sentencepiece tokenization for natural language processing, JS version.",
   "main": "dist/index.js",
   "exports": {
@@ -11,12 +11,12 @@
     "build": "./build.sh; rollup --config",
     "test": "web-test-runner \"test/**/*.test.js\" \"src/**/*.test.js\" --node-resolve",
     "test:watch": "web-test-runner \"test/**/*.test.js\" \"src/**/*.test.js\" --node-resolve --watch",
-    "develop": "web-dev-server --node-resolve --watch --open"
+    "develop": "web-dev-server --node-resolve --watch --open",
+    "convert_models": "node ./test/convertmodels.js"
   },
   "files": [
     "./dist/index.js",
-    "./dist/index.d.ts",
-    "./dist/llama-3.1-tokenizer.model"
+    "./dist/index.d.ts"
   ],
   "repository": {
     "type": "git",
```

src/clean_30k.ts

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default (base64 model module generated by test/convertmodels.js).

src/index.ts

Lines changed: 5 additions & 2 deletions
```diff
@@ -1,4 +1,7 @@
 import { SentencePieceProcessor, cleanText } from "./sentencePieceProcessor";
+import { llama_3_1_tokeniser_b64 } from "./llama_3_1_tokeniser_model";
+import { smart_b64 } from "./smart";
+import { clean_30k_b64 } from "./clean_30k";
 
-export { SentencePieceProcessor, cleanText }
-export default { SentencePieceProcessor, cleanText }
+export { SentencePieceProcessor, cleanText, llama_3_1_tokeniser_b64, clean_30k_b64, smart_b64 };
+export default { SentencePieceProcessor, cleanText };
```
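The model constants are added as named exports only; the default export keeps its previous shape, so existing default-import consumers are unaffected. A small illustration (a sketch, assuming the package is installed as shown in the README):

```js
// Named imports pick up the new base64 model constants; the default
// export is unchanged from 1.2.0.
import spp, { llama_3_1_tokeniser_b64 } from "@sctg/sentencepiece-js";

console.log(typeof llama_3_1_tokeniser_b64); // "string" (base64-encoded model)
console.log(Object.keys(spp));               // ["SentencePieceProcessor", "cleanText"]
```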

src/llama_3_1_tokeniser_model.ts

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default (base64 model module generated by test/convertmodels.js).

src/sentencePieceProcessor.ts

Lines changed: 25 additions & 6 deletions
```diff
@@ -6,14 +6,34 @@ export class SentencePieceProcessor {
     processor: any;
     sentencepiece: any;
 
+    uuidv4(): string {
+        return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function (c) {
+            var r = Math.random() * 16 | 0,
+                v = c == 'x' ? r : (r & 0x3 | 0x8);
+            return v.toString(16);
+        });
+    }
+
+    // load model from a base64 encoded string
+    async loadFromB64StringModel(b64model: string) {
+        // decode base64 string
+        const model = Buffer.from(b64model, 'base64');
+        await this._loadModel(model);
+    }
+
     // load model
     async load(url: string) {
+        const model = fs.readFileSync(url);
+        await this._loadModel(model);
+    }
 
-        this.sentencepiece = await Module();
 
-        // change to fs read model file
-        this.sentencepiece.FS.writeFile("sentencepiece.model", fs.readFileSync(url));
-        const string_view = new this.sentencepiece.StringView("sentencepiece.model");
+    // private function to load model
+    private async _loadModel(model: Buffer) {
+        const tempName = this.uuidv4() + ".model";
+        this.sentencepiece = await Module();
+        this.sentencepiece.FS.writeFile(tempName, model);
+        const string_view = new this.sentencepiece.StringView(tempName);
         const absl_string_view = string_view.getView();
 
         this.processor = new this.sentencepiece.SentencePieceProcessor();
@@ -22,10 +42,9 @@ export class SentencePieceProcessor {
         load_status.delete();
         absl_string_view.delete();
         string_view.delete();
-
+        this.sentencepiece.FS.unlink(tempName);
     }
 
-
     encodeIds(text: string) {
 
         const string_view = new this.sentencepiece.StringView(text);
```
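One caveat on the new base64 path: `loadFromB64StringModel` decodes with `Buffer`, a Node global that browsers only get through a bundler polyfill. For a plain-browser context without one, a decode along these lines would be equivalent (a sketch only; the helper name is hypothetical and not part of this commit):

```js
// Sketch: Buffer-free base64 decode (hypothetical helper, not in this commit).
// Emscripten's FS.writeFile also accepts a Uint8Array, so the result could be
// written to the virtual filesystem the same way as the Buffer above.
function b64ToBytes(b64) {
    const bin = atob(b64);                 // base64 -> binary string
    const bytes = new Uint8Array(bin.length);
    for (let i = 0; i < bin.length; i++) {
        bytes[i] = bin.charCodeAt(i);      // one byte per char code
    }
    return bytes;
}
```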

src/smart.ts

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default (base64 model module generated by test/convertmodels.js).

src/test.js

Lines changed: 13 additions & 1 deletion
```diff
@@ -1,4 +1,4 @@
-const { SentencePieceProcessor, cleanText } = require("../dist");
+const { SentencePieceProcessor, cleanText, llama_3_1_tokeniser_b64 } = require("../dist");
 const ROOT = require('app-root-path')
 
 async function main() {
@@ -10,10 +10,22 @@ async function main() {
     await spp.load(`${ROOT}/test/llama-3.1-tokenizer.model`)
     let ids = spp.encodeIds(cleaned);
     console.log(ids)
+    console.log(`Token length: ${ids.length}`)
     let str = spp.decodeIds(ids)
     console.log(str)
 
     let pieces = spp.encodePieces(cleaned);
     console.log(pieces)
+
+    let spp2 = new SentencePieceProcessor();
+    await spp2.loadFromB64StringModel(llama_3_1_tokeniser_b64);
+    let ids2 = spp2.encodeIds(cleaned);
+    console.log(ids2)
+    console.log(`Token length: ${ids2.length}`)
+    let str2 = spp2.decodeIds(ids2)
+    console.log(str2)
+
+    let pieces2 = spp2.encodePieces(cleaned);
+    console.log(pieces2);
 }
 main()
```
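Since both processors load the same model, the test could go one step further and assert the two tokenisations agree (a sketch, assuming `encodeIds` returns a plain array of numbers as the README comments suggest):

```js
// Sketch: the file-loaded and base64-loaded processors should agree.
console.assert(
    JSON.stringify(ids) === JSON.stringify(ids2),
    "file-loaded and base64-loaded tokenisations differ"
);
```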

test/convertmodels.js

Lines changed: 33 additions & 0 deletions
```diff
@@ -0,0 +1,33 @@
+const fs = require('fs');
+const path = require('path');
+
+function convertModel(filePath, output, variableName) {
+    fs.readFile(filePath, (err, data) => {
+        if (err) {
+            console.error('Error while reading the binary file:', err);
+            return;
+        }
+
+        // Convert the content to base64
+        const base64Content = data.toString('base64');
+
+        // Build the string exporting the constant
+        const outputContent = `export const ${variableName} = "${base64Content}";`;
+
+        // Path to the output file
+        const outputFilePath = path.join(__dirname, output);
+
+        // Write the string to the output file
+        fs.writeFile(outputFilePath, outputContent, (err) => {
+            if (err) {
+                console.error('Error while writing the output file:', err);
+                return;
+            }
+            console.log('Output file generated successfully:', outputFilePath);
+        });
+    });
+}
+
+convertModel('./test/llama-3.1-tokenizer.model', '../src/llama_3_1_tokeniser_model.ts', 'llama_3_1_tokeniser_b64');
+convertModel('./test/30k-clean.model', '../src/clean_30k.ts', 'clean_30k_b64');
+convertModel('./test/smart.model', '../src/smart.ts', 'smart_b64');
```
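This script is wired to the new `convert_models` entry in package.json, so `npm run convert_models` regenerates the three embedded modules from the binary models under test/. A minimal round-trip check of one generated constant (a sketch, assuming the package has been built so `../dist` re-exports it, as in src/test.js):

```js
// Sketch: confirm an embedded base64 constant matches its binary model.
const fs = require('fs');
const { llama_3_1_tokeniser_b64 } = require("../dist");

const original = fs.readFileSync('./test/llama-3.1-tokenizer.model');
const decoded = Buffer.from(llama_3_1_tokeniser_b64, 'base64');
console.log(decoded.equals(original)); // true when the conversion is faithful
```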
