feat: use llama3.2 vision for image to text task

LeafYeeXYZ · Dec 4, 2024 · a9520cf · a9520cf
1 parent 9f1c228
commit a9520cf
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -18,6 +18,8 @@ An image creator based on **free** `Cloudflare AI` and `HuggingFace` APIs. Featur
 
 You can use either `Fullstack` or `Client-Server` mode.
 
+> You may need to initialize the `Cloudflare AI` `llama3.2 11B vision` model before using the `Image-to-Prompt` feature. See the [model documentation](https://developers.cloudflare.com/workers-ai/models/llama-3.2-11b-vision-instruct/#Input) for more information.
+
 #### 1.1.1 Fullstack
 
 Set the following environment variables in the `.env` file or in `Vercel`.

diff --git a/app/api/prompt/route.ts b/app/api/prompt/route.ts
@@ -1,11 +1,11 @@
 export async function POST(req: Request): Promise<Response> {
   try {
     const { image } = await req.json()
-    const url = `https://api.cloudflare.com/client/v4/accounts/${process.env.CF_USER_ID}/ai/run/@cf/unum/uform-gen2-qwen-500m`
+    const url = `https://api.cloudflare.com/client/v4/accounts/${process.env.CF_USER_ID}/ai/run/@cf/meta/llama-3.2-11b-vision-instruct`
     const body = {
       image: image as number[],
       max_tokens: 4096,
-      prompt: 'Generate a detailed description in a single paragraph for this image',
+      prompt: 'Analyze the given image and provide a detailed description. Include details about the main subject/people, background, colors, composition, and mood. Ensure the description is vivid and suitable for input into a text-to-image generation model.',
     }
     const response = await fetch(url, {
       method: 'POST',

diff --git a/app/components/Prompt.tsx b/app/components/Prompt.tsx
@@ -86,7 +86,7 @@ export default function Prompt() {
               showUploadList={false}
               accept='.jpg,.jpeg,.png'
               beforeUpload={async (file) => {
-                const MAX_SIZE_MB = 2
+                const MAX_SIZE_MB = 5
                 try {
                   flushSync(() => setDisabled(true))
                   if (file.size > MAX_SIZE_MB * 1024 * 1024) {
@@ -96,7 +96,7 @@ export default function Prompt() {
                   const uint8array = new Uint8Array(await file.arrayBuffer())
                   let res: Response | undefined
                   if (process.env.NEXT_PUBLIC_WORKERS_SERVER) {
-                    res = await fetch(`${process.env.NEXT_PUBLIC_WORKERS_SERVER}/painter/genprompt`, {
+                    res = await fetch(`${process.env.NEXT_PUBLIC_WORKERS_SERVER}/painter/genprompt/v4`, {
                       method: 'POST',
                       body: JSON.stringify({ image: Array.from(uint8array) })
                     })
@@ -111,7 +111,7 @@ export default function Prompt() {
                     return false
                   }
                   const data = await res.json()
-                  const prompt = data.result.description as string
+                  const prompt = data.result.response as string
                   form.setFieldsValue({ prompt })
                   return false
                 } finally {