Simon-Liu committed on
Commit
12352d8
·
verified ·
1 Parent(s): a3a329b

Upload tokenizer

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<end_of_image>": 262145,
3
+ "<image_soft_token>": 262144
4
+ }
chat_template.jinja ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ {#- Begin-of-sequence token to start the model prompt -#}
3
+ {{ bos_token }}
4
+ {#- Extracts the system message. Gemma does not support system messages so it will be prepended to first user message. -#}
5
+ {%- if messages[0]['role'] == 'system' -%}
6
+ {%- if messages[0]['content'] is string -%}
7
+ {%- set first_user_prefix = messages[0]['content'] + '
8
+
9
+ ' -%}
10
+ {%- else -%}
11
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
12
+
13
+ ' -%}
14
+ {%- endif -%}
15
+ {%- set loop_messages = messages[1:] -%}
16
+ {%- else -%}
17
+ {%- set first_user_prefix = "" -%}
18
+ {%- set loop_messages = messages -%}
19
+ {%- endif -%}
20
+ {#- Set tools to none if not defined for this ChatCompletion request (helps avoid errors later) -#}
21
+ {%- if not tools is defined %}
22
+ {%- set tools = none %}
23
+ {%- endif %}
24
+ {#- Validate alternating user/assistant messages. Skips 'tool' messages and messages whose tool_calls is truthy; messages where tool_calls is missing, None or empty are still validated. -#}
25
+ {%- for message in loop_messages | rejectattr("role", "equalto", "tool") | rejectattr("tool_calls") -%}
26
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
27
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
28
+ {%- endif -%}
29
+ {%- endfor -%}
30
+
31
+ {#- Main loop over all messages in the conversation history -#}
32
+ {%- for message in loop_messages -%}
33
+ {#- Normalize roles for model prompt formatting -#}
34
+ {%- if (message['role'] == 'assistant') -%}
35
+ {%- set role = "model" -%}
36
+ {%- elif (message['role'] == 'tool') -%}
37
+ {%- set role = "user" -%}
38
+ {%- else -%}
39
+ {%- set role = message['role'] -%}
40
+ {%- endif -%}
41
+ {#- Mark the start of a message block with the appropriate role -#}
42
+ {{ '<start_of_turn>' + role + '
43
+ ' -}}
44
+
45
+ {#- Insert system message content (if present) at the beginning of the first message. -#}
46
+ {%- if loop.first -%}
47
+ {{ first_user_prefix }}
48
+ {#- Append system message with tool information if using tools in message request. -#}
49
+ {%- if tools is not none -%}
50
+ {{- "Tools (functions) are available. If you decide to invoke one or more of the tools, you must respond with a python list of the function calls.
51
+ " -}}
52
+ {{- "Example Format: [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
53
+ " -}}
54
+ {{- "Do not use variables. DO NOT USE MARKDOWN SYNTAX. You SHOULD NOT include any other text in the response if you call a function. If none of the functions can be used, point it out. If you lack the parameters required by the function, also point it out.
55
+ " -}}
56
+ {{- "Here is a list of functions in JSON format that you can invoke.
57
+ " -}}
58
+ {{- tools | tojson(indent=4) -}}
59
+ {{- "
60
+
61
+ " -}}
62
+ {%- endif -%}
63
+ {%- endif -%}
64
+
65
+ {#- Format model tool calls (turns where model indicates they want to call a tool) -#}
66
+ {#- FIX: Check if message.tool_calls is truthy (not None) instead of just checking if key exists -#}
67
+ {%- if message.tool_calls -%}
68
+ {#- Opening bracket for tool call list. -#}
69
+ {{- '[' -}}
70
+ {#- For each tool call -#}
71
+ {%- for tool_call in message.tool_calls -%}
72
+ {#- Get tool call function. -#}
73
+ {%- if tool_call.function is defined -%}
74
+ {%- set tool_call = tool_call.function -%}
75
+ {%- endif -%}
76
+ {#- Function name & opening parenthesis. -#}
77
+ {{- tool_call.name + '(' -}}
78
+
79
+ {#-- Render the call arguments: dict -> key=value pairs, string -> raw text, other iterable -> positional list, anything else -> single JSON value --#}
80
+ {#-- Named arguments (dict) --#}
81
+ {%- if tool_call.arguments is mapping -%}
82
+ {#- A variable assigned with `set` inside a for-loop does not persist across iterations, so the separator is driven by loop.first instead of a flag. -#}
83
+ {%- for key, val in tool_call.arguments.items() -%}
84
+ {%- if not loop.first %}, {% endif -%}
85
+ {{ key }}={{ val | tojson }}
86
+ {#- Each pair is emitted as key=<JSON-encoded value>. -#}
87
+ {%- endfor -%}
88
+ {#-- String arguments (JSON string) - Handle specifically to avoid treating string as iterable chars --#}
89
+ {%- elif tool_call.arguments is string -%}
90
+ {#- Warning: Cannot parse JSON string inside Jinja. Outputting raw string or you need to pre-process data. -#}
91
+ {{- tool_call.arguments -}}
92
+ {#-- Positional arguments (list) --#}
93
+ {%- elif tool_call.arguments is iterable -%}
94
+ {{- tool_call.arguments | map('tojson') | join(', ') -}}
95
+ {#-- Fallback: single positional value --#}
96
+ {%- else -%}
97
+ {{- tool_call.arguments | tojson -}}
98
+ {%- endif -%}
99
+ {#-- Closing parenthesis. --#}
100
+ {{- ')' -}}
101
+ {#-- If more than one tool call, place comma and move to formatting next tool call --#}
102
+ {%- if not loop.last -%}, {% endif -%}
103
+ {%- endfor -%}
104
+ {#- Closing bracket for tool call list. -#}
105
+ {{- ']' -}}
106
+ {%- endif -%}
107
+
108
+ {#- Tool response start tag (for messages from a tool) -#}
109
+ {%- if (message['role'] == 'tool') -%}
110
+ {{ '<tool_response>
111
+ ' -}}
112
+ {%- endif -%}
113
+
114
+ {#- Render the message content: a plain string is trimmed and emitted; a list emits the begin-of-image token for 'image' items and trimmed text for 'text' items. -#}
115
+ {%- if message['content'] is string -%}
116
+ {{ message['content'] | trim }}
117
+ {%- elif message['content'] is iterable -%}
118
+ {%- for item in message['content'] -%}
119
+ {%- if item['type'] == 'image' -%}
120
+ {{ '<start_of_image>' }}
121
+ {%- elif item['type'] == 'text' -%}
122
+ {{ item['text'] | trim }}
123
+ {%- endif -%}
124
+ {%- endfor -%}
125
+ {%- endif -%}
126
+
127
+ {#- Tool response end tag -#}
128
+ {%- if (message['role'] == 'tool') -%}
129
+ {{ '</tool_response>' -}}
130
+ {%- endif -%}
131
+
132
+ {#- Mark end of a single turn -#}
133
+ {{ '<end_of_turn>
134
+ ' }}
135
+ {%- endfor -%}
136
+
137
+ {#- If generation is to be triggered, add model prompt prefix -#}
138
+ {%- if add_generation_prompt -%}
139
+ {{'<start_of_turn>model
140
+ '}}
141
+ {%- endif -%}
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "sfr_token": "<start_function_response>",
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b09a0b4a803ad453063ca4bb49a784540e8120004e2450e025df2b27d41fb2
3
+ size 33384899
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa009fcbc3589a9904d30d04834094fea4653c2ac6d2de2cd1262d4f7a50ceb3
3
+ size 4689144
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff