From 57238ce1cd78d2405779887ca48fbf2994c0f918 Mon Sep 17 00:00:00 2001
From: Emberstar <969242373@qq.com>
Date: Mon, 2 Oct 2023 00:00:14 +0800
Subject: [PATCH] update: docs
---
README.md | 203 +++-----------------------------------------------
README_zh.md | 205 +++------------------------------------------------
2 files changed, 22 insertions(+), 386 deletions(-)
diff --git a/README.md b/README.md
index b1b6cd4..f4d06a7 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
-
+
@@ -16,6 +16,7 @@
+
# Feature
- [x] VITS text-to-speech, voice conversion
@@ -46,7 +47,7 @@ https://user-images.githubusercontent.com/73542220/237995061-c1f25b4e-dd86-438a-
# Deploy
-## Docker
+## Docker (Recommended for Linux)
### Docker image pull script
@@ -140,17 +141,15 @@ Run the docker image pull script again
### Download python dependencies
-A python virtual environment is recommended,use python >= 3.9
+A Python virtual environment is recommended
`pip install -r requirements.txt`
Fasttext may not be installed on windows, you can install it with the following command,or download wheels [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#fasttext)
```
-#python3.10 win_amd64
+# python3.10 win_amd64
pip install https://github.com/Artrajz/archived/raw/main/fasttext/fasttext-0.9.2-cp310-cp310-win_amd64.whl
-#python3.9 win_amd64
-pip install https://github.com/Artrajz/archived/raw/main/fasttext/fasttext-0.9.2-cp39-cp39-win_amd64.whl
```
### Download VITS model
@@ -228,10 +227,12 @@ nvidia-smi
```
Taking CUDA 11.7 as an example, download it from the [official website](https://developer.nvidia.com/cuda-11-7-0-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exe_local)
### Install GPU version of PyTorch
+
+1.13.1+cu117 is recommended; other versions may have memory instability issues.
+
```
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
+pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
```
-You can find the corresponding command for the version you need on the [official website](https://pytorch.org/get-started/locally/)
## Linux
The installation process is similar, but I don't have the environment to test it.
@@ -273,190 +274,7 @@ pip install pyopenjtalk -i https://pypi.artrajz.cn/simple
## POST
-- python
-
-```python
-import re
-import requests
-import os
-import random
-import string
-from requests_toolbelt.multipart.encoder import MultipartEncoder
-
-abs_path = os.path.dirname(__file__)
-base = "http://127.0.0.1:23456"
-
-
-# 映射表
-def voice_speakers():
- url = f"{base}/voice/speakers"
-
- res = requests.post(url=url)
- json = res.json()
- for i in json:
- print(i)
- for j in json[i]:
- print(j)
- return json
-
-
-# 语音合成 voice vits
-def voice_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50):
- fields = {
- "text": text,
- "id": str(id),
- "format": format,
- "lang": lang,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- "max": str(max)
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 语音转换 hubert-vits
-def voice_hubert_vits(upload_path, id, format="wav", length=1, noise=0.667, noisew=0.8):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- "id": str(id),
- "format": format,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/hubert-vits"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 维度情感模型 w2v2-vits
-def voice_w2v2_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50, emotion=0):
- fields = {
- "text": text,
- "id": str(id),
- "format": format,
- "lang": lang,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- "max": str(max),
- "emotion": str(emotion)
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/w2v2-vits"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 语音转换 同VITS模型内角色之间的音色转换
-def voice_conversion(upload_path, original_id, target_id):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- "original_id": str(original_id),
- "target_id": str(target_id),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
- m = MultipartEncoder(fields=fields, boundary=boundary)
-
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/conversion"
-
- res = requests.post(url=url, data=m, headers=headers)
-
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-def voice_ssml(ssml):
- fields = {
- "ssml": ssml,
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/ssml"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-def voice_dimensional_emotion(upload_path):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/dimension-emotion"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-```
+- See `api_test.py`
## API KEY
@@ -614,4 +432,5 @@ Learning and communication,now there is only Chinese [QQ group](https://qm.qq.co
- emotional-vits:https://github.com/innnky/emotional-vits
- vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
- vits_chinese:https://github.com/PlayVoice/vits_chinese
+- Bert_VITS2:https://github.com/fishaudio/Bert-VITS2
diff --git a/README_zh.md b/README_zh.md
index 80f5143..cbe5f40 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -5,7 +5,7 @@
-
+
@@ -16,6 +16,7 @@
+
# Feature
- [x] VITS语音合成,语音转换
@@ -48,7 +49,7 @@ https://user-images.githubusercontent.com/73542220/237995061-c1f25b4e-dd86-438a-
# 部署
-## Docker部署
+## Docker部署(Linux推荐)
### 镜像拉取脚本
@@ -137,17 +138,15 @@ DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml"
### 下载python依赖
-推荐使用python的虚拟环境,python版本 >= 3.9
+推荐使用python的虚拟环境
`pip install -r requirements.txt`
windows下可能安装不了fasttext,可以用以下命令安装,附[wheels下载地址](https://www.lfd.uci.edu/~gohlke/pythonlibs/#fasttext)
```
-#python3.10 win_amd64
+# python3.10 win_amd64
pip install https://github.com/Artrajz/archived/raw/main/fasttext/fasttext-0.9.2-cp310-cp310-win_amd64.whl
-#python3.9 win_amd64
-pip install https://github.com/Artrajz/archived/raw/main/fasttext/fasttext-0.9.2-cp39-cp39-win_amd64.whl
```
### 下载VITS模型
@@ -222,14 +221,12 @@ nvidia-smi
### 安装GPU版pytorch
-CUDA11.7对应的pytorch是用这个命令安装
+CUDA11.7对应的pytorch是用这个命令安装,推荐使用1.13.1+cu117,其他版本可能存在内存不稳定的问题。
```
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
+pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
```
-对应版本的命令可以在[官网](https://pytorch.org/get-started/locally/)找到
-
## Linux
安装过程类似,但我没有相应的环境所以没办法测试
@@ -272,190 +269,9 @@ pip install pyopenjtalk -i https://pypi.artrajz.cn/simple
## POST
-- python
-
-```python
-import re
-import requests
-import os
-import random
-import string
-from requests_toolbelt.multipart.encoder import MultipartEncoder
-
-abs_path = os.path.dirname(__file__)
-base = "http://127.0.0.1:23456"
-
-
-# 映射表
-def voice_speakers():
- url = f"{base}/voice/speakers"
-
- res = requests.post(url=url)
- json = res.json()
- for i in json:
- print(i)
- for j in json[i]:
- print(j)
- return json
-
-
-# 语音合成 voice vits
-def voice_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50):
- fields = {
- "text": text,
- "id": str(id),
- "format": format,
- "lang": lang,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- "max": str(max)
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 语音转换 hubert-vits
-def voice_hubert_vits(upload_path, id, format="wav", length=1, noise=0.667, noisew=0.8):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- "id": str(id),
- "format": format,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/hubert-vits"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 维度情感模型 w2v2-vits
-def voice_w2v2_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50, emotion=0):
- fields = {
- "text": text,
- "id": str(id),
- "format": format,
- "lang": lang,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- "max": str(max),
- "emotion": str(emotion)
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/w2v2-vits"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 语音转换 同VITS模型内角色之间的音色转换
-def voice_conversion(upload_path, original_id, target_id):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- "original_id": str(original_id),
- "target_id": str(target_id),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
- m = MultipartEncoder(fields=fields, boundary=boundary)
-
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/conversion"
-
- res = requests.post(url=url, data=m, headers=headers)
-
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-def voice_ssml(ssml):
- fields = {
- "ssml": ssml,
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/ssml"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-def voice_dimensional_emotion(upload_path):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/dimension-emotion"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-```
+- 见`api_test.py`
+
+
## API KEY
@@ -613,4 +429,5 @@ def voice_dimensional_emotion(upload_path):
- emotional-vits:https://github.com/innnky/emotional-vits
- vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
- vits_chinese:https://github.com/PlayVoice/vits_chinese
+- Bert_VITS2:https://github.com/fishaudio/Bert-VITS2