Add homepage and keywords to packages
Commit 7ceb134 (parent e8213f7), committed by Systemcluster on Dec 20, 2024
Showing 6 changed files with 34 additions and 18 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -7,6 +7,7 @@ authors = ["Christian Sdunek <[email protected]>"]
 keywords = ["tokenizer", "nlp", "bpe", "unigram", "wordpiece"]
 categories = ["text-processing", "algorithms", "wasm", "no-std", "parser-implementations"]
 license = "BSD-2-Clause"
+homepage = "https://kitoken.dev"
 repository = "https://github.com/Systemcluster/kitoken"
 readme = "README.md"
 edition = "2021"
19 changes: 10 additions & 9 deletions packages/javascript/README.md
@@ -5,8 +5,8 @@
 ```js
 import { Kitoken } from "kitoken/node"
 
-const model = fs.readFileSync("models/llama2.kit")
-const encoder = Kitoken.from_sentencepiece(model)
+const model = fs.readFileSync("models/llama3.3.model")
+const encoder = new Kitoken(model)
 
 const tokens = encoder.encode("hello world!", true)
 const string = TextDecoder().decode(encoder.decode(tokens))
@@ -27,7 +27,7 @@ const string = TextDecoder().decode(encoder.decode(tokens))
 
 ## Overview
 
-Kitoken is a fast and versatile tokenizer for language models. Multiple tokenization algorithms are supported:
+Kitoken is a fast and versatile tokenizer for language models with support for multiple tokenization algorithms:
 
 - **BytePair**: A variation of the BPE algorithm, merging byte or character pairs.
 - **Unigram**: The Unigram subword algorithm.
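The BytePair idea mentioned in the README above can be sketched as a simple merge loop. The merge table and its ranks below are invented for illustration; this is not kitoken's implementation.

```javascript
// Toy sketch of byte-pair merging: repeatedly merge the adjacent token pair
// with the highest-priority (lowest-rank) merge rule until none applies.
function bpeEncode(text, merges) {
  // Start with one token per character.
  let tokens = [...text];
  while (true) {
    // Find the adjacent pair with the lowest merge rank.
    let best = -1;
    let bestRank = Infinity;
    for (let i = 0; i < tokens.length - 1; i++) {
      const rank = merges.get(tokens[i] + "\u0000" + tokens[i + 1]);
      if (rank !== undefined && rank < bestRank) {
        bestRank = rank;
        best = i;
      }
    }
    if (best < 0) break; // no mergeable pair left
    tokens.splice(best, 2, tokens[best] + tokens[best + 1]);
  }
  return tokens;
}

// Hypothetical merge ranks: lower rank merges first.
const merges = new Map([
  ["h\u0000e", 0],
  ["l\u0000l", 1],
  ["he\u0000ll", 2],
  ["hell\u0000o", 3],
]);
console.log(bpeEncode("hello", merges)); // ["hello"]
console.log(bpeEncode("help", merges)); // ["he", "l", "p"]
```

Real BPE variants differ in details (byte-level vs. character-level starting alphabet, handling of unknown pairs), but the merge loop above captures the core of the algorithm.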
@@ -38,11 +38,12 @@ including [SentencePiece](https://github.com/google/sentencepiece), [HuggingFace
 
 The JavaScript package provides multiple exports:
 
-- `kitoken`: The default export, importing the WebAssembly file directly. Usable with Webpack and other bundlers.
-- `kitoken/node`: Uses Node.js functions to read the WebAssembly file from the file system. Provides support for additional split strategies and regex optimizations.
-- `kitoken/web`: Usable with web browsers, uses `new URL(..., import.meta.url)` to load the WebAssembly file.
-
-- `kitoken/minimal`: Smallest file size. Similar to the default export, but only supports initialization from `.kit` definitions.
-- `kitoken/full`: Largest file size. Similar to the default export, but provides support for additional split strategies and regex optimizations.
+| Export            | Description |
+|-------------------|-------------|
+| `kitoken`         | The default export, importing the WebAssembly file directly. Usable with Webpack and other bundlers. |
+| `kitoken/node`    | Uses Node.js functions to read the WebAssembly file from the file system. Provides support for additional split strategies and regex optimizations. |
+| `kitoken/web`     | Usable with web browsers, uses `new URL(..., import.meta.url)` to load the WebAssembly file. |
+| `kitoken/minimal` | Smallest file size. Similar to the default export, but only supports initialization from `.kit` definitions. |
+| `kitoken/full`    | Largest file size. Similar to the default export, but provides support for additional split strategies and regex optimizations. |
 
 See the main [README](//github.com/Systemcluster/kitoken) for more information.
8 changes: 8 additions & 0 deletions packages/javascript/package.json
@@ -8,11 +8,19 @@
     "email": "[email protected]"
   },
   "license": "BSD-2-Clause",
+  "homepage": "https://kitoken.dev",
   "repository": {
     "type": "git",
     "url": "github:Systemcluster/kitoken"
   },
   "readme": "README.md",
+  "keywords": [
+    "tokenizer",
+    "nlp",
+    "bpe",
+    "unigram",
+    "wordpiece"
+  ],
   "engines": {
     "node": ">=18"
   },
8 changes: 4 additions & 4 deletions packages/javascript/test.js
@@ -6,7 +6,7 @@ import { Kitoken } from "kitoken/node";
 console.debug(Kitoken);
 
 const model = fs.readFileSync("../../tests/models/sentencepiece/llama2.model");
-const encoder = Kitoken.from_sentencepiece(model);
+const encoder = new Kitoken(model);
 console.debug(encoder);
 
 const en = encoder.encode("hello world!", true);
@@ -17,11 +17,11 @@ console.debug(new TextDecoder().decode(de));
 assert.equal(new TextDecoder().decode(de), "hello world!");
 
 const text = new TextDecoder().decode(
-    fs.readFileSync("../../benches/data/wagahai.txt"),
+    fs.readFileSync("../../benches/data/wagahai.txt"),
 );
 const now = Date.now();
 for (let i = 0; i < 100; i++) {
-    const _ = encoder.encode(text, true);
+    const _ = encoder.encode(text, true);
 }
 console.info(`100 iterations in ${(Date.now() - now).toFixed(3)}ms`);

@@ -40,7 +40,7 @@ assert.equal(new TextDecoder().decode(demu[1]), "hello world!");
 const t = encoder.encode("Kitoken. Tokenize Everything!", true);
 console.debug(t);
 console.debug(new TextDecoder().decode(encoder.decode(t)));
-console.debug(encoder.decode_all([...t].map(x=>[x])).map(x=>new TextDecoder().decode(x)));
+console.debug(encoder.decode_all([...t].map(x => [x])).map(x => new TextDecoder().decode(x)));
 
 encoder.to_bytes()
 console.info("OK");
8 changes: 4 additions & 4 deletions packages/python/README.md
@@ -5,10 +5,10 @@
 ```py
 from kitoken import Kitoken
 
-const encoder = Kitoken.from_file("models/llama2.kit")
+encoder = Kitoken.from_file("models/llama3.3.model")
 
-const tokens = encoder.encode("hello world!", True)
-const string = encoder.decode(tokens).decode("utf-8")
+tokens = encoder.encode("hello world!", True)
+string = encoder.decode(tokens).decode("utf-8")
 
 assert string == "hello world!"
 ```
@@ -28,7 +28,7 @@ assert string == "hello world!"
 
 ## Overview
 
-Kitoken is a fast and versatile tokenizer for language models. Multiple tokenization algorithms are supported:
+Kitoken is a fast and versatile tokenizer for language models with support for multiple tokenization algorithms:
 
 - **BytePair**: A variation of the BPE algorithm, merging byte or character pairs.
 - **Unigram**: The Unigram subword algorithm.
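The Unigram step mentioned above picks the token sequence with the highest total score. A toy dynamic-programming sketch follows; the vocabulary and its log-probability scores are invented for illustration, whereas a real Unigram model learns them during training.

```javascript
// Toy sketch of Unigram segmentation: choose the segmentation of the input
// with the highest total log-probability via dynamic programming.
function unigramEncode(text, vocab) {
  const n = text.length;
  const best = new Array(n + 1).fill(-Infinity); // best score up to each position
  const back = new Array(n + 1).fill(-1);        // backpointer to segment start
  best[0] = 0;
  for (let end = 1; end <= n; end++) {
    for (let start = 0; start < end; start++) {
      const score = vocab.get(text.slice(start, end));
      if (score !== undefined && best[start] + score > best[end]) {
        best[end] = best[start] + score;
        back[end] = start;
      }
    }
  }
  if (back[n] < 0) return null; // text not coverable by the vocabulary
  const tokens = [];
  for (let end = n; end > 0; end = back[end]) {
    tokens.unshift(text.slice(back[end], end));
  }
  return tokens;
}

// Hypothetical log-probabilities: a single long piece beats many short ones.
const vocab = new Map([
  ["hello", -1.0], ["hell", -2.5], ["o", -1.5],
  ["h", -4.0], ["e", -4.0], ["l", -4.0],
]);
console.log(unigramEncode("hello", vocab)); // ["hello"]
```

Here "hello" as a single token scores -1.0, beating "hell" + "o" at -4.0, so the whole word wins.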
8 changes: 7 additions & 1 deletion packages/python/pyproject.toml
@@ -23,11 +23,17 @@ classifiers = [
     "License :: OSI Approved :: BSD License",
     "Typing :: Typed",
 ]
+keywords = ["tokenizer", "nlp", "bpe", "unigram", "wordpiece"]
 authors = [{ name = "Christian Sdunek", email = "[email protected]" }]
-urls = { repository = "https://github.com/Systemcluster/kitoken" }
 readme = "README.md"
 
 
+[project.urls]
+
+Homepage = "https://kitoken.dev"
+Repository = "https://github.com/Systemcluster/kitoken"
+
 
 [build-system]
 
 requires = [
