src.data
Full module source:

```python
from functools import partial

import pandas as pd
import streamlit as st
import torch
from datasets import Dataset, DatasetDict, load_dataset  # type: ignore
from torch.nn.functional import cross_entropy
from transformers import DataCollatorForTokenClassification  # type: ignore

from src.utils import device, tokenizer_hash_funcs


@st.cache(allow_output_mutation=True)
def get_data(ds_name: str, config_name: str, split_name: str, split_sample_size: int) -> Dataset:
    """Loads a Dataset from the HuggingFace hub (if not already loaded).

    Uses `datasets.load_dataset` to load the dataset (see its documentation for additional details).

    Args:
        ds_name (str): Path or name of the dataset.
        config_name (str): Name of the dataset configuration.
        split_name (str): Which split of the data to load.
        split_sample_size (int): The number of examples to load from the split.

    Returns:
        Dataset: A Dataset object.
    """
    ds: DatasetDict = load_dataset(ds_name, name=config_name, use_auth_token=True).shuffle(seed=0)  # type: ignore
    split = ds[split_name].select(range(split_sample_size))
    return split


@st.cache(
    allow_output_mutation=True,
    hash_funcs=tokenizer_hash_funcs,
)
def get_collator(tokenizer) -> DataCollatorForTokenClassification:
    """Returns a DataCollator that will dynamically pad the inputs received, as well as the labels.

    Args:
        tokenizer ([PreTrainedTokenizer] or [PreTrainedTokenizerFast]): The tokenizer used for encoding the data.

    Returns:
        DataCollatorForTokenClassification: The DataCollatorForTokenClassification object.
    """
    return DataCollatorForTokenClassification(tokenizer)


def create_word_ids_from_input_ids(tokenizer, input_ids: list[int]) -> list[int]:
    """Takes a list of input_ids and returns the corresponding word_ids.

    Args:
        tokenizer: The tokenizer that was used to obtain the input ids.
        input_ids (list[int]): List of token ids.

    Returns:
        list[int]: Word ids corresponding to the input ids.
    """
    word_ids = []
    wid = -1
    tokens = [tokenizer.convert_ids_to_tokens(i) for i in input_ids]

    for i, tok in enumerate(tokens):
        if tok in tokenizer.all_special_tokens:
            word_ids.append(-1)
            continue

        # Start a new word unless the previous token is a subword continuation ("@@") or <unk>.
        if not tokens[i - 1].endswith("@@") and tokens[i - 1] != "<unk>":
            wid += 1

        word_ids.append(wid)

    assert len(word_ids) == len(input_ids)
    return word_ids


def tokenize(batch, tokenizer) -> dict:
    """Tokenizes a batch of examples.

    Args:
        batch: The examples to tokenize
        tokenizer: The tokenizer to use

    Returns:
        dict: The tokenized batch
    """
    tokenized_inputs = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    wids = []

    for idx, label in enumerate(batch["ner_tags"]):
        try:
            word_ids = tokenized_inputs.word_ids(batch_index=idx)
        except ValueError:
            # Slow tokenizers have no word_ids(); reconstruct them from the input ids.
            word_ids = create_word_ids_from_input_ids(
                tokenizer, tokenized_inputs["input_ids"][idx]
            )
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Label only the first sub-token of each word; everything else gets -100.
            if word_idx == -1 or word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        wids.append(word_ids)
        labels.append(label_ids)
    tokenized_inputs["word_ids"] = wids
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


def stringify_ner_tags(batch: dict, tags) -> dict:
    """Stringifies a dataset batch's NER tags."""
    return {"ner_tags_str": [tags.int2str(idx) for idx in batch["ner_tags"]]}


def encode_dataset(split: Dataset, tokenizer):
    """Encodes a dataset split.

    Args:
        split (Dataset): A Dataset object.
        tokenizer: A PreTrainedTokenizer object.

    Returns:
        tuple: The encoded Dataset, the per-example word ids, and the original example ids.
    """
    tags = split.features["ner_tags"].feature
    split = split.map(partial(stringify_ner_tags, tags=tags), batched=True)
    remove_columns = split.column_names
    ids = split["id"]
    split = split.map(
        partial(tokenize, tokenizer=tokenizer),
        batched=True,
        remove_columns=remove_columns,
    )
    word_ids = [[id if id is not None else -1 for id in wids] for wids in split["word_ids"]]
    return split.remove_columns(["word_ids"]), word_ids, ids


def forward_pass_with_label(batch, model, collator, num_classes: int) -> dict:
    """Runs the forward pass for a batch of examples.

    Args:
        batch: The batch to process
        model: The model to process the batch with
        collator: A data collator
        num_classes (int): Number of classes

    Returns:
        dict: A dictionary containing `losses`, `preds` and `hidden_states`
    """
    # Convert dict of lists to list of dicts suitable for data collator
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]

    # Pad inputs and labels and put all tensors on device
    batch = collator(features)
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    with torch.no_grad():
        # Pass data through model
        output = model(input_ids, attention_mask, output_hidden_states=True)
        # output.logits shape: [batch_size, sequence_length, num_classes]

        # Predict class with largest logit value on classes axis
        preds = torch.argmax(output.logits, axis=-1).cpu().numpy()  # type: ignore

        # Calculate loss per token after flattening batch dimension with view
        loss = cross_entropy(
            output.logits.view(-1, num_classes), labels.view(-1), reduction="none"
        )

        # Unflatten batch dimension and convert to numpy array
        loss = loss.view(len(input_ids), -1).cpu().numpy()
        hidden_states = output.hidden_states[-1].cpu().numpy()

        # logits = output.logits.view(len(input_ids), -1).cpu().numpy()

    return {"losses": loss, "preds": preds, "hidden_states": hidden_states}


def predict(split_encoded: Dataset, model, tokenizer, collator, tags) -> pd.DataFrame:
    """Generates predictions for a given dataset split and returns the results as a dataframe.

    Args:
        split_encoded (Dataset): The dataset to process
        model: The model to process the dataset with
        tokenizer: The tokenizer to process the dataset with
        collator: The data collator to use
        tags: The tags used in the dataset

    Returns:
        pd.DataFrame: A dataframe containing token-level predictions.
    """
    split_encoded = split_encoded.map(
        partial(
            forward_pass_with_label,
            model=model,
            collator=collator,
            num_classes=tags.num_classes,
        ),
        batched=True,
        batch_size=8,
    )
    df: pd.DataFrame = split_encoded.to_pandas()  # type: ignore

    df["tokens"] = df["input_ids"].apply(
        lambda x: tokenizer.convert_ids_to_tokens(x)  # type: ignore
    )
    # Map integer labels/predictions back to tag strings; -100 positions become "IGN".
    df["labels"] = df["labels"].apply(
        lambda x: ["IGN" if i == -100 else tags.int2str(int(i)) for i in x]
    )
    df["preds"] = df["preds"].apply(lambda x: [model.config.id2label[i] for i in x])
    # Trim padded positions so every per-token column has the same length as input_ids.
    df["preds"] = df.apply(lambda x: x["preds"][: len(x["input_ids"])], axis=1)
    df["losses"] = df.apply(lambda x: x["losses"][: len(x["input_ids"])], axis=1)
    df["hidden_states"] = df.apply(lambda x: x["hidden_states"][: len(x["input_ids"])], axis=1)
    df["total_loss"] = df["losses"].apply(sum)

    return df
```
def get_data(ds_name: str, config_name: str, split_name: str, split_sample_size: int) -> Dataset
Loads a Dataset from the HuggingFace hub (if not already loaded).
Uses `datasets.load_dataset` to load the dataset (see its documentation for additional details).
Args
- ds_name (str): Path or name of the dataset.
- config_name (str): Name of the dataset configuration.
- split_name (str): Which split of the data to load.
- split_sample_size (int): The number of examples to load from the split.
Returns
Dataset: A Dataset object.
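A minimal usage sketch. The dataset name, config, and split below are placeholder assumptions, not requirements of the API; also note that the function passes `use_auth_token=True`, so a Hugging Face token must be available locally (e.g. via `huggingface-cli login`):

```python
from src.data import get_data

# Placeholder dataset/config/split; any token-classification dataset on the Hub
# with at least `split_sample_size` examples in the chosen split should work.
split = get_data(
    ds_name="conll2003",
    config_name="conll2003",
    split_name="validation",
    split_sample_size=50,
)
print(split.num_rows)      # 50
print(split.column_names)  # e.g. ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']
```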
def get_collator(tokenizer) -> DataCollatorForTokenClassification
Returns a DataCollator that will dynamically pad the inputs received, as well as the labels.
Args
- tokenizer ([PreTrainedTokenizer] or [PreTrainedTokenizerFast]): The tokenizer used for encoding the data.
Returns
DataCollatorForTokenClassification: The DataCollatorForTokenClassification object.
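A short sketch of what the collator does with a ragged batch; the checkpoint name and token ids are illustrative assumptions:

```python
from transformers import AutoTokenizer
from src.data import get_collator

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # placeholder checkpoint
collator = get_collator(tokenizer)

# Two features of different lengths; the collator pads input_ids with the
# tokenizer's pad token and pads "labels" with -100 so the loss ignores padding.
features = [
    {"input_ids": [101, 7592, 102], "labels": [-100, 3, -100]},
    {"input_ids": [101, 7592, 2088, 999, 102], "labels": [-100, 3, 0, 0, -100]},
]
batch = collator(features)
print(batch["input_ids"].shape)  # torch.Size([2, 5])
print(batch["labels"][0])        # tensor([-100,    3, -100, -100, -100])
```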
def create_word_ids_from_input_ids(tokenizer, input_ids: list[int]) -> list[int]
Takes a list of `input_ids` and returns the corresponding `word_ids`.
Args
- tokenizer: The tokenizer that was used to obtain the input ids.
- input_ids (list[int]): List of token ids.
Returns
list[int]: Word ids corresponding to the input ids.
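This is the fallback used when a slow tokenizer has no `word_ids()` method; the heuristic assumes subword-nmt-style `"@@"` continuation markers. A toy illustration with a stand-in tokenizer object (purely hypothetical, just to show the mapping):

```python
from types import SimpleNamespace

from src.data import create_word_ids_from_input_ids

# Stand-in tokenizer: an id -> token lookup plus a list of special tokens.
vocab = {0: "<s>", 1: "Ham@@", 2: "burg", 3: "ist", 4: "schön", 5: "</s>"}
toy_tokenizer = SimpleNamespace(
    convert_ids_to_tokens=lambda i: vocab[i],
    all_special_tokens=["<s>", "</s>", "<unk>"],
)

# "Ham@@" + "burg" collapse to a single word id; special tokens map to -1.
print(create_word_ids_from_input_ids(toy_tokenizer, [0, 1, 2, 3, 4, 5]))
# [-1, 0, 0, 1, 2, -1]
```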
def tokenize(batch, tokenizer) -> dict
Tokenizes a batch of examples.
Args
- batch: The examples to tokenize
- tokenizer: The tokenizer to use
Returns
dict: The tokenized batch
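A direct call on a hand-rolled batch (checkpoint and example sentence are placeholders); `Dataset.map(..., batched=True)` passes batches in exactly this column format:

```python
from transformers import AutoTokenizer
from src.data import tokenize

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # placeholder checkpoint

batch = {
    "tokens": [["EU", "rejects", "German", "call"]],
    "ner_tags": [[3, 0, 7, 0]],
}
encoded = tokenize(batch, tokenizer)
print(encoded["input_ids"][0])
print(encoded["labels"][0])    # -100 for special tokens and non-first sub-tokens
print(encoded["word_ids"][0])  # word index per token, None for special tokens
```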
def encode_dataset(split: Dataset, tokenizer)
Encodes a dataset split.
Args
- split (Dataset): A Dataset object.
- tokenizer: A PreTrainedTokenizer object.
Returns
tuple: The encoded Dataset, the per-example word ids, and the original example ids.
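Sketch of how the three return values are typically unpacked. The split is expected to have `id` and `ner_tags` columns; the dataset and checkpoint names are assumptions:

```python
from transformers import AutoTokenizer
from src.data import get_data, encode_dataset

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # placeholder checkpoint
split = get_data("conll2003", "conll2003", "validation", split_sample_size=50)

split_encoded, word_ids, ids = encode_dataset(split, tokenizer)
print(split_encoded.column_names)  # e.g. ['input_ids', 'attention_mask', 'labels']
print(len(word_ids), len(ids))     # one word-id list and one original id per example
```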
def forward_pass_with_label(batch, model, collator, num_classes: int) -> dict
Runs the forward pass for a batch of examples.
Args
- batch: The batch to process
- model: The model to process the batch with
- collator: A data collator
- num_classes (int): Number of classes
Returns
dict: A dictionary containing `losses`, `preds` and `hidden_states`.
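A direct call on a small columnar batch; the checkpoint and token ids are illustrative, and in practice this function is applied through `Dataset.map` as in `predict` below:

```python
from transformers import AutoModelForTokenClassification, AutoTokenizer
from src.data import forward_pass_with_label, get_collator
from src.utils import device

checkpoint = "dslim/bert-base-NER"  # placeholder NER checkpoint
model = AutoModelForTokenClassification.from_pretrained(checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
collator = get_collator(tokenizer)

# Columnar batch as produced by encode_dataset; token ids are illustrative.
batch = {
    "input_ids": [[101, 7592, 102], [101, 7592, 2088, 102]],
    "attention_mask": [[1, 1, 1], [1, 1, 1, 1]],
    "labels": [[-100, 0, -100], [-100, 0, 0, -100]],
}
out = forward_pass_with_label(batch, model, collator, num_classes=model.config.num_labels)
print(out["losses"].shape, out["preds"].shape, out["hidden_states"].shape)
# e.g. (2, 4) (2, 4) (2, 4, 768)
```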
def predict(split_encoded: Dataset, model, tokenizer, collator, tags) -> pd.DataFrame
Generates predictions for a given dataset split and returns the results as a dataframe.
Args
- split_encoded (Dataset): The dataset to process
- model: The model to process the dataset with
- tokenizer: The tokenizer to process the dataset with
- collator: The data collator to use
- tags: The tags used in the dataset
Returns
pd.DataFrame: A dataframe containing token-level predictions.
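An end-to-end sketch tying the module together. The dataset and checkpoint names are assumptions; the model's label set should correspond to the dataset's `ner_tags` feature for the predictions to be meaningful:

```python
from transformers import AutoModelForTokenClassification, AutoTokenizer
from src.data import encode_dataset, get_collator, get_data, predict
from src.utils import device

checkpoint = "dslim/bert-base-NER"  # placeholder NER checkpoint
model = AutoModelForTokenClassification.from_pretrained(checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
collator = get_collator(tokenizer)

split = get_data("conll2003", "conll2003", "validation", split_sample_size=50)
tags = split.features["ner_tags"].feature
split_encoded, word_ids, ids = encode_dataset(split, tokenizer)

df = predict(split_encoded, model, tokenizer, collator, tags)
print(df[["tokens", "labels", "preds", "total_loss"]].head())
```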