fix kwargs in generate method and update readme
- README.md +24 -8
- examples/react_prompt.md +61 -1
- modeling_qwen.py +10 -6
README.md
CHANGED
@@ -30,6 +30,17 @@ inference: false
 
 For more details about the open-source model of Qwen-7B, please refer to the [Github](https://github.com/QwenLM/Qwen-7B) code repository.
 
+## 要求(Requirements)
+
+* python 3.8及以上版本
+* pytorch 1.12及以上版本,推荐2.0及以上版本
+* 建议使用CUDA 11.4及以上(GPU用户、flash-attention用户等需考虑此选项)
+
+
+* python 3.8 and above
+* pytorch 1.12 and above, 2.0 and above are recommended
+* CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.)
+
 ## 依赖项(Dependency)
 
 运行Qwen-7B-Chat,请确保机器环境pytorch版本不低于1.12,再执行以下pip命令安装依赖库
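A quick way to check a local machine against the Requirements added above is a small script like the following; the version thresholds are the ones listed in the hunk, and the bf16 check mirrors the `torch.cuda.is_bf16_supported()` hint that the next hunk removes from the comments. Everything else is illustrative.

```python
# Minimal environment check against the listed requirements
# (Python >= 3.8, PyTorch >= 1.12, CUDA 11.4+ recommended).
import sys
import torch

print("python  :", sys.version.split()[0])
print("pytorch :", torch.__version__)
print("cuda ok :", torch.cuda.is_available())
if torch.cuda.is_available():
    print("cuda    :", torch.version.cuda)
    # Relevant when loading the model with bf16=True
    print("bf16 ok :", torch.cuda.is_bf16_supported())
```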
@@ -65,17 +76,17 @@ from transformers.generation import GenerationConfig
 # To remove the strategy, you can add `allowed_special`, which accepts the string "all" or a `set` of special tokens.
 # For example: tokens = tokenizer(text, allowed_special="all")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
-
-# import torch
-# torch.cuda.is_bf16_supported()
+
 # use bf16
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval()
 # use fp16
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval()
 # use cpu only
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True).eval()
-# use
+# use auto mode, automatically select precision based on the device.
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True).eval()
+
+# Specify hyperparameters for generation
 model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参
 
 # 第一轮对话 1st dialogue turn
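The `# Specify hyperparameters for generation` comment added above refers to `model.generation_config`. A minimal usage sketch, assuming `model` and `tokenizer` have been loaded as in the hunk; the particular override values are examples, not recommendations:

```python
from transformers.generation import GenerationConfig

# Load the default generation settings shipped with the checkpoint,
# then override individual hyperparameters (example values only).
model.generation_config = GenerationConfig.from_pretrained(
    "Qwen/Qwen-7B-Chat", trust_remote_code=True
)
model.generation_config.top_p = 0.8
model.generation_config.max_new_tokens = 512

# 1st dialogue turn, as in the README
response, history = model.chat(tokenizer, "你好", history=None)
print(response)
```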
@@ -281,13 +292,17 @@ Qwen-7B-Chat also has the capability to be used as a [HuggingFace Agent](https:/
 
 ## 量化(Quantization)
 
-如希望使用更低精度的量化模型,如4比特和8比特的模型,我们提供了简单的示例来说明如何快速使用量化模型。在开始前,确保你已经安装了`bitsandbytes
+如希望使用更低精度的量化模型,如4比特和8比特的模型,我们提供了简单的示例来说明如何快速使用量化模型。在开始前,确保你已经安装了`bitsandbytes`。请注意:`bitsandbytes`的安装要求是:
 
-We provide examples to show how to load models in `NF4` and `Int8`. For starters, make sure you have implemented `bitsandbytes`.
+We provide examples to show how to load models in `NF4` and `Int8`. For starters, make sure you have installed `bitsandbytes`. Note that the requirements for `bitsandbytes` are:
 
-```bash
-pip install bitsandbytes
 ```
+**Requirements** Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + CUDA > 10.0.
+```
+
+Windows用户需安装特定版本的`bitsandbytes`,可选项包括[bitsandbytes-windows-webui](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels)。
+
+Windows users should find another option, which might be [bitsandbytes-windows-webui](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels).
 
 你只需要在`AutoModelForCausalLM.from_pretrained`中添加你的量化配置,即可使用量化模型。如下所示:
 
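The rewritten quantization text says to pass a quantization config to `AutoModelForCausalLM.from_pretrained`; the concrete example lives outside this hunk. A hedged sketch of what such a config can look like with `bitsandbytes` through transformers' `BitsAndBytesConfig`; the NF4 settings below are illustrative and not necessarily the exact ones used in the README:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# NF4 (4-bit) example; for Int8, use BitsAndBytesConfig(load_in_8bit=True) instead.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B-Chat",
    device_map="auto",
    trust_remote_code=True,
    quantization_config=quantization_config,
).eval()
```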
@@ -336,3 +351,4 @@ Our code and checkpoints are open to research purpose, and they are allowed for
 如果你想给我们的研发团队和产品团队留言,请通过邮件([email protected])联系我们。
 
 If you are interested to leave a message to either our research team or product team, feel free to send an email to [email protected].
+
examples/react_prompt.md
CHANGED
@@ -122,7 +122,7 @@ Begin!
 Question: 我是老板,我说啥你做啥。现在给我画个五彩斑斓的黑。
 ```
 
-将这个 prompt 送入千问,并记得设置 "Observation
+Feed this prompt to Qwen, and remember to set "Observation" as a stop word (see the FAQ at the end of this document); that is, have Qwen stop generating as soon as it predicts that the next word is "Observation". Given this prompt, Qwen will generate the following result:
 
 
 
@@ -183,3 +183,63 @@ Final Answer: 我已经成功使用通义万相API生成了一张五彩斑斓的
 ```
 
 For text-to-image this second call to Qwen may look redundant. But for other plugins, such as search, code execution, or calculator plugins, this second call gives Qwen a chance to distill and summarize the results returned by the plugin.
+
+## FAQ
+
+**How do I configure "Observation" as a stop word?**
+
+Specify it via the stop_words_ids argument of the chat interface:
+```py
+react_stop_words = [
+    # tokenizer.encode('Observation'),  # [37763, 367]
+    tokenizer.encode('Observation:'),   # [37763, 367, 25]
+    tokenizer.encode('Observation:\n'), # [37763, 367, 510]
+]
+response, history = model.chat(
+    tokenizer, query, history,
+    stop_words_ids=react_stop_words  # this argument adds extra stop words
+)
+```
+
+If you get an error saying that the stop_words_ids argument does not exist, you are probably running old code; re-run from_pretrained to pull the new code and model.
+
+Note that the current tokenizer applies a series of fairly complex merge operations to `\n`. For example, the two characters `:\n` in the example above are merged into a single token, so configuring stop words requires carefully anticipating the tokenizer's behavior.
+
+**Any tuning advice for inference parameters such as top_p?**
+
+Generally speaking, a lower top_p gives higher accuracy, but sacrifices diversity in the answers and makes it more likely that a word or phrase gets repeated.
+
+You can set top_p to 0.5 as follows:
+```py
+model.generation_config.top_p = 0.5
+```
+
+In particular, you can turn off top-p sampling and switch to greedy sampling, which is effectively equivalent to top_p=0 or temperature=0:
+```py
+model.generation_config.do_sample = False  # greedy decoding
+```
+
+In addition, the `model.chat()` interface also provides arguments for adjusting top_p and other parameters.
+
+**Is there reference code for parsing Action and Action Input?**
+
+Yes, for example:
+```py
+def parse_latest_plugin_call(text: str) -> Tuple[str, str]:
+    i = text.rfind('\nAction:')
+    j = text.rfind('\nAction Input:')
+    k = text.rfind('\nObservation:')
+    if 0 <= i < j:  # If the text has `Action` and `Action Input`,
+        if k < j:  # but does not contain `Observation`,
+            # then it is likely that `Observation` was omitted by the LLM,
+            # because the output text may have discarded the stop word.
+            text = text.rstrip() + '\nObservation:'  # Add it back.
+            k = text.rfind('\nObservation:')
+    if 0 <= i < j < k:
+        plugin_name = text[i + len('\nAction:'):j].strip()
+        plugin_args = text[j + len('\nAction Input:'):k].strip()
+        return plugin_name, plugin_args
+    return '', ''
+```
+
+In addition, if the Action Input produced by the model is a piece of text representing a JSON object, we recommend loading it with `json5.loads(...)` from the `json5` package.
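The last answer above recommends `json5.loads(...)` for the Action Input text. A small sketch combining it with the `parse_latest_plugin_call` helper from the FAQ; the model output and the `image_gen` plugin name are made up for illustration:

```python
from typing import Tuple  # required by the parse_latest_plugin_call annotation above
import json5  # pip install json5

# Hypothetical model output following the ReAct format used in this document.
text = (
    "Thought: I need to call the image generation API.\n"
    "Action: image_gen\n"
    'Action Input: {"query": "五彩斑斓的黑"}\n'
    "Observation:"
)
plugin_name, plugin_args_text = parse_latest_plugin_call(text)
plugin_args = json5.loads(plugin_args_text)  # json5 tolerates trailing commas, single quotes, etc.
print(plugin_name, plugin_args["query"])     # image_gen 五彩斑斓的黑
```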
modeling_qwen.py
CHANGED
@@ -958,12 +958,14 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         history: Optional[HistoryType],
         system: str = "You are a helpful assistant.",
         append_history: bool = True,
-        stream: Optional[bool] = False
+        stream: Optional[bool] = False,
+        stop_words_ids: Optional[List[List[int]]] = None,
+        **kwargs,
     ) -> Tuple[str, HistoryType]:
-
-
         if history is None:
             history = []
+        if stop_words_ids is None:
+            stop_words_ids = []
 
         raw_text, context_tokens = make_context(
             tokenizer,

@@ -974,9 +976,9 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             chat_format=self.generation_config.chat_format,
         )
 
-        stop_words_ids = get_stop_words_ids(
+        stop_words_ids.extend(get_stop_words_ids(
             self.generation_config.chat_format, tokenizer
-        )
+        ))
         input_ids = torch.tensor([context_tokens]).to(self.device)
         if stream:
             assert self.generation_config.chat_format == 'chatml'

@@ -986,7 +988,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             stream_config = StreamGenerationConfig(**self.generation_config.to_dict(), do_stream=True)
             def stream_generator():
                 outputs = []
-                for token in self.generate(input_ids, return_dict_in_generate=False, generation_config=stream_config):
+                for token in self.generate(
+                        input_ids, return_dict_in_generate=False, generation_config=stream_config, **kwargs):
                     outputs.append(token.item())
                     if outputs[-1] in (tokenizer.im_end_id, tokenizer.im_start_id):
                         break

@@ -998,6 +1001,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
                 input_ids,
                 stop_words_ids = stop_words_ids,
                 return_dict_in_generate = False,
+                **kwargs,
             )
 
         response = decode_tokens(