From 57238ce1cd78d2405779887ca48fbf2994c0f918 Mon Sep 17 00:00:00 2001
From: Emberstar <969242373@qq.com>
Date: Mon, 2 Oct 2023 00:00:14 +0800
Subject: [PATCH] update: docs
---
README.md | 203 +++-----------------------------------------------
README_zh.md | 205 +++------------------------------------------------
2 files changed, 22 insertions(+), 386 deletions(-)
diff --git a/README.md b/README.md
index b1b6cd4..f4d06a7 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
-
+
@@ -16,6 +16,7 @@
+
# Feature
- [x] VITS text-to-speech, voice conversion
@@ -46,7 +47,7 @@ https://user-images.githubusercontent.com/73542220/237995061-c1f25b4e-dd86-438a-
# Deploy
-## Docker
+## Docker (Recommended for Linux)
### Docker image pull script
@@ -140,17 +141,15 @@ Run the docker image pull script again
### Download python dependencies
-A python virtual environment is recommended,use python >= 3.9
+A Python virtual environment is recommended
`pip install -r requirements.txt`
Fasttext may not be installed on windows, you can install it with the following command,or download wheels [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#fasttext)
```
-#python3.10 win_amd64
+# python3.10 win_amd64
pip install https://github.com/Artrajz/archived/raw/main/fasttext/fasttext-0.9.2-cp310-cp310-win_amd64.whl
-#python3.9 win_amd64
-pip install https://github.com/Artrajz/archived/raw/main/fasttext/fasttext-0.9.2-cp39-cp39-win_amd64.whl
```
### Download VITS model
@@ -228,10 +227,12 @@ nvidia-smi
```
Taking CUDA 11.7 as an example, download it from the [official website](https://developer.nvidia.com/cuda-11-7-0-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exe_local)
### Install GPU version of PyTorch
+
+1.13.1+cu117 is recommended; other versions may have memory instability issues.
+
```
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
+pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
```
-You can find the corresponding command for the version you need on the [official website](https://pytorch.org/get-started/locally/)
## Linux
The installation process is similar, but I don't have the environment to test it.
@@ -273,190 +274,7 @@ pip install pyopenjtalk -i https://pypi.artrajz.cn/simple
## POST
-- python
-
-```python
-import re
-import requests
-import os
-import random
-import string
-from requests_toolbelt.multipart.encoder import MultipartEncoder
-
-abs_path = os.path.dirname(__file__)
-base = "http://127.0.0.1:23456"
-
-
-# 映射表
-def voice_speakers():
- url = f"{base}/voice/speakers"
-
- res = requests.post(url=url)
- json = res.json()
- for i in json:
- print(i)
- for j in json[i]:
- print(j)
- return json
-
-
-# 语音合成 voice vits
-def voice_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50):
- fields = {
- "text": text,
- "id": str(id),
- "format": format,
- "lang": lang,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- "max": str(max)
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 语音转换 hubert-vits
-def voice_hubert_vits(upload_path, id, format="wav", length=1, noise=0.667, noisew=0.8):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- "id": str(id),
- "format": format,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/hubert-vits"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 维度情感模型 w2v2-vits
-def voice_w2v2_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50, emotion=0):
- fields = {
- "text": text,
- "id": str(id),
- "format": format,
- "lang": lang,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- "max": str(max),
- "emotion": str(emotion)
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/w2v2-vits"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 语音转换 同VITS模型内角色之间的音色转换
-def voice_conversion(upload_path, original_id, target_id):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- "original_id": str(original_id),
- "target_id": str(target_id),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
- m = MultipartEncoder(fields=fields, boundary=boundary)
-
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/conversion"
-
- res = requests.post(url=url, data=m, headers=headers)
-
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-def voice_ssml(ssml):
- fields = {
- "ssml": ssml,
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/ssml"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-def voice_dimensional_emotion(upload_path):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/dimension-emotion"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-```
+- See `api_test.py`
## API KEY
@@ -614,4 +432,5 @@ Learning and communication,now there is only Chinese [QQ group](https://qm.qq.co
- emotional-vits:https://github.com/innnky/emotional-vits
- vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
- vits_chinese:https://github.com/PlayVoice/vits_chinese
+- Bert_VITS2:https://github.com/fishaudio/Bert-VITS2
diff --git a/README_zh.md b/README_zh.md
index 80f5143..cbe5f40 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -5,7 +5,7 @@
-
+
@@ -16,6 +16,7 @@
+
# Feature
- [x] VITS语音合成,语音转换
@@ -48,7 +49,7 @@ https://user-images.githubusercontent.com/73542220/237995061-c1f25b4e-dd86-438a-
# 部署
-## Docker部署
+## Docker部署(Linux推荐)
### 镜像拉取脚本
@@ -137,17 +138,15 @@ DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml"
### 下载python依赖
-推荐使用python的虚拟环境,python版本 >= 3.9
+推荐使用python的虚拟环境
`pip install -r requirements.txt`
windows下可能安装不了fasttext,可以用以下命令安装,附[wheels下载地址](https://www.lfd.uci.edu/~gohlke/pythonlibs/#fasttext)
```
-#python3.10 win_amd64
+# python3.10 win_amd64
pip install https://github.com/Artrajz/archived/raw/main/fasttext/fasttext-0.9.2-cp310-cp310-win_amd64.whl
-#python3.9 win_amd64
-pip install https://github.com/Artrajz/archived/raw/main/fasttext/fasttext-0.9.2-cp39-cp39-win_amd64.whl
```
### 下载VITS模型
@@ -222,14 +221,12 @@ nvidia-smi
### 安装GPU版pytorch
-CUDA11.7对应的pytorch是用这个命令安装
+CUDA11.7对应的pytorch是用这个命令安装,推荐使用1.13.1+cu117,其他版本可能存在内存不稳定的问题。
```
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
+pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
```
-对应版本的命令可以在[官网](https://pytorch.org/get-started/locally/)找到
-
## Linux
安装过程类似,但我没有相应的环境所以没办法测试
@@ -272,190 +269,9 @@ pip install pyopenjtalk -i https://pypi.artrajz.cn/simple
## POST
-- python
-
-```python
-import re
-import requests
-import os
-import random
-import string
-from requests_toolbelt.multipart.encoder import MultipartEncoder
-
-abs_path = os.path.dirname(__file__)
-base = "http://127.0.0.1:23456"
-
-
-# 映射表
-def voice_speakers():
- url = f"{base}/voice/speakers"
-
- res = requests.post(url=url)
- json = res.json()
- for i in json:
- print(i)
- for j in json[i]:
- print(j)
- return json
-
-
-# 语音合成 voice vits
-def voice_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50):
- fields = {
- "text": text,
- "id": str(id),
- "format": format,
- "lang": lang,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- "max": str(max)
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 语音转换 hubert-vits
-def voice_hubert_vits(upload_path, id, format="wav", length=1, noise=0.667, noisew=0.8):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- "id": str(id),
- "format": format,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/hubert-vits"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 维度情感模型 w2v2-vits
-def voice_w2v2_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50, emotion=0):
- fields = {
- "text": text,
- "id": str(id),
- "format": format,
- "lang": lang,
- "length": str(length),
- "noise": str(noise),
- "noisew": str(noisew),
- "max": str(max),
- "emotion": str(emotion)
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/w2v2-vits"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-# 语音转换 同VITS模型内角色之间的音色转换
-def voice_conversion(upload_path, original_id, target_id):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- "original_id": str(original_id),
- "target_id": str(target_id),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
- m = MultipartEncoder(fields=fields, boundary=boundary)
-
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/conversion"
-
- res = requests.post(url=url, data=m, headers=headers)
-
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-
-def voice_ssml(ssml):
- fields = {
- "ssml": ssml,
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/ssml"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-
-def voice_dimensional_emotion(upload_path):
- upload_name = os.path.basename(upload_path)
- upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
-
- with open(upload_path, 'rb') as upload_file:
- fields = {
- "upload": (upload_name, upload_file, upload_type),
- }
- boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
-
- m = MultipartEncoder(fields=fields, boundary=boundary)
- headers = {"Content-Type": m.content_type}
- url = f"{base}/voice/dimension-emotion"
-
- res = requests.post(url=url, data=m, headers=headers)
- fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
- path = f"{abs_path}/{fname}"
-
- with open(path, "wb") as f:
- f.write(res.content)
- print(path)
- return path
-```
+- 见`api_test.py`
+
+
## API KEY
@@ -613,4 +429,5 @@ def voice_dimensional_emotion(upload_path):
- emotional-vits:https://github.com/innnky/emotional-vits
- vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
- vits_chinese:https://github.com/PlayVoice/vits_chinese
+- Bert_VITS2:https://github.com/fishaudio/Bert-VITS2