mgelard committed on
Commit
f5968bc
·
verified ·
1 Parent(s): acefdf1

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.py +1 -0
  2. tokenizer_config.json +10 -11
tokenizer.py CHANGED
@@ -79,6 +79,7 @@ class BinnedOmicTokenizer(PreTrainedTokenizer):
79
  gene_expr = gene_expr / self.normalization_factor
80
 
81
  token_ids = np.digitize(gene_expr, self.bin_edges).astype(int)
 
82
  token_ids[gene_expr == 0.0] = 0
83
 
84
  if self.prepend_cls_token:
 
79
  gene_expr = gene_expr / self.normalization_factor
80
 
81
  token_ids = np.digitize(gene_expr, self.bin_edges).astype(int)
82
+ token_ids = np.clip(token_ids, 0, self.n_expressions_bins - 1)
83
  token_ids[gene_expr == 0.0] = 0
84
 
85
  if self.prepend_cls_token:
tokenizer_config.json CHANGED
@@ -1,17 +1,16 @@
1
  {
2
- "tokenizer_class": "BinnedOmicTokenizer",
3
- "n_expressions_bins": 64,
4
- "min_omic_value": 0.0,
5
- "max_omic_value": 1.0,
6
- "use_max_normalization": true,
7
- "normalization_factor": 5.547176906585117,
8
- "prepend_cls_token": false,
9
- "fixed_sequence_length": null,
10
- "unpadded_length": null,
11
  "auto_map": {
12
  "AutoTokenizer": [
13
  "tokenizer.BinnedOmicTokenizer",
14
  null
15
  ]
16
- }
17
- }
 
 
 
 
 
 
 
 
1
  {
2
+ "added_tokens_decoder": {},
 
 
 
 
 
 
 
 
3
  "auto_map": {
4
  "AutoTokenizer": [
5
  "tokenizer.BinnedOmicTokenizer",
6
  null
7
  ]
8
+ },
9
+ "clean_up_tokenization_spaces": false,
10
+ "cls_token": "<cls>",
11
+ "extra_special_tokens": {},
12
+ "mask_token": "<mask>",
13
+ "model_max_length": 1000000000000000019884624838656,
14
+ "pad_token": "<pad>",
15
+ "tokenizer_class": "BinnedOmicTokenizer"
16
+ }