src.data

from functools import partial

import pandas as pd
import streamlit as st
import torch
from datasets import Dataset, DatasetDict, load_dataset  # type: ignore
from torch.nn.functional import cross_entropy
from transformers import DataCollatorForTokenClassification  # type: ignore

from src.utils import device, tokenizer_hash_funcs

@st.cache(allow_output_mutation=True)
def get_data(ds_name: str, config_name: str, split_name: str, split_sample_size: int) -> Dataset:
    """Loads a Dataset from the HuggingFace hub (if not already loaded).

    Uses `datasets.load_dataset` to load the dataset (see its documentation for additional details).

    Args:
        ds_name (str): Path or name of the dataset.
        config_name (str): Name of the dataset configuration.
        split_name (str): Which split of the data to load.
        split_sample_size (int): The number of examples to load from the split.

    Returns:
        Dataset: A Dataset object.
    """
    ds: DatasetDict = load_dataset(ds_name, name=config_name, use_auth_token=True).shuffle(seed=0)  # type: ignore
    split = ds[split_name].select(range(split_sample_size))
    return split
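
For example, a small sample of a token-classification dataset can be pulled like this (the dataset and split names are illustrative, not fixed by this module):

>>> split = get_data("conll2003", "conll2003", "validation", 50)
>>> len(split)
50
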
@st.cache(
    allow_output_mutation=True,
    hash_funcs=tokenizer_hash_funcs,
)
def get_collator(tokenizer) -> DataCollatorForTokenClassification:
    """Returns a DataCollator that will dynamically pad the inputs received, as well as the labels.

    Args:
        tokenizer ([PreTrainedTokenizer] or [PreTrainedTokenizerFast]): The tokenizer used for encoding the data.

    Returns:
        DataCollatorForTokenClassification: The DataCollatorForTokenClassification object.
    """
    return DataCollatorForTokenClassification(tokenizer)
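
A minimal usage sketch, assuming any pretrained checkpoint (the name below is only an example):

>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
>>> collator = get_collator(tokenizer)
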
def create_word_ids_from_input_ids(tokenizer, input_ids: list[int]) -> list[int]:
    """Takes a list of input_ids and returns the corresponding word_ids.

    Serves as a fallback for tokenizers that do not provide `word_ids` directly
    (see its use in `tokenize` below).

    Args:
        tokenizer: The tokenizer that was used to obtain the input ids.
        input_ids (list[int]): List of token ids.

    Returns:
        list[int]: Word ids corresponding to the input ids.
    """
    word_ids = []
    wid = -1
    tokens = [tokenizer.convert_ids_to_tokens(i) for i in input_ids]

    for i, tok in enumerate(tokens):
        # Special tokens (e.g. BOS/EOS, padding) belong to no word.
        if tok in tokenizer.all_special_tokens:
            word_ids.append(-1)
            continue

        # A trailing "@@" marks a BPE continuation: only start a new word
        # if the previous token completed one.
        if not tokens[i - 1].endswith("@@") and tokens[i - 1] != "<unk>":
            wid += 1

        word_ids.append(wid)

    assert len(word_ids) == len(input_ids)
    return word_ids
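
As an illustration, for a hypothetical tokenizer that marks BPE continuations with a trailing "@@":

# Hypothetical: tokens ["<s>", "Ber@@", "lin", "is", "nice", "</s>"]
# -> word_ids [-1, 0, 0, 1, 2, -1]
# Specials map to -1; "Ber@@" and "lin" share word id 0.
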
def tokenize(batch, tokenizer) -> dict:
    """Tokenizes a batch of examples.

    Args:
        batch: The examples to tokenize.
        tokenizer: The tokenizer to use.

    Returns:
        dict: The tokenized batch.
    """
    tokenized_inputs = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    wids = []

    for idx, label in enumerate(batch["ner_tags"]):
        try:
            word_ids = tokenized_inputs.word_ids(batch_index=idx)
        except ValueError:
            # Slow tokenizers don't support `word_ids`; reconstruct them manually.
            word_ids = create_word_ids_from_input_ids(
                tokenizer, tokenized_inputs["input_ids"][idx]
            )
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Label only the first sub-token of each word; special tokens and
            # continuation sub-tokens get -100, which the loss function ignores.
            if word_idx == -1 or word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        wids.append(word_ids)
        labels.append(label_ids)
    tokenized_inputs["word_ids"] = wids
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
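
The expected input follows the batched `map` convention of `datasets`; a sketch with illustrative values, assuming a tokenizer that prepends a special token:

>>> batch = {"tokens": [["Berlin", "is", "nice"]], "ner_tags": [[5, 0, 0]]}
>>> encoded = tokenize(batch, tokenizer)
>>> encoded["labels"][0][0]  # first position is a special token
-100
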
def stringify_ner_tags(batch: dict, tags) -> dict:
    """Stringifies a dataset batch's NER tags."""
    return {"ner_tags_str": [tags.int2str(idx) for idx in batch["ner_tags"]]}
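
Here `tags` is expected to be a `ClassLabel` feature; a sketch with illustrative ids and label names:

>>> tags = split.features["ner_tags"].feature
>>> stringify_ner_tags({"ner_tags": [[3, 0]]}, tags)
{'ner_tags_str': [['B-ORG', 'O']]}
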
def encode_dataset(split: Dataset, tokenizer):
    """Encodes a dataset split.

    Args:
        split (Dataset): A Dataset object.
        tokenizer: A PreTrainedTokenizer object.

    Returns:
        tuple: The encoded Dataset, the per-example word ids (with `None`
            replaced by -1), and the original example ids.
    """

    tags = split.features["ner_tags"].feature
    split = split.map(partial(stringify_ner_tags, tags=tags), batched=True)
    remove_columns = split.column_names
    ids = split["id"]
    split = split.map(
        partial(tokenize, tokenizer=tokenizer),
        batched=True,
        remove_columns=remove_columns,
    )
    word_ids = [[id if id is not None else -1 for id in wids] for wids in split["word_ids"]]
    return split.remove_columns(["word_ids"]), word_ids, ids
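
Typical usage unpacks all three return values:

>>> split_encoded, word_ids, ids = encode_dataset(split, tokenizer)
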
def forward_pass_with_label(batch, model, collator, num_classes: int) -> dict:
    """Runs the forward pass for a batch of examples.

    Args:
        batch: The batch to process.
        model: The model to process the batch with.
        collator: A data collator.
        num_classes (int): Number of classes.

    Returns:
        dict: A dictionary containing `losses`, `preds` and `hidden_states`.
    """

    # Convert dict of lists to list of dicts suitable for data collator
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]

    # Pad inputs and labels and put all tensors on device
    batch = collator(features)
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    with torch.no_grad():
        # Pass data through model
        output = model(input_ids, attention_mask, output_hidden_states=True)
        # output.logits has shape [batch_size, sequence_length, num_classes]

        # Predict class with largest logit value on classes axis
        preds = torch.argmax(output.logits, axis=-1).cpu().numpy()  # type: ignore

        # Calculate loss per token after flattening batch dimension with view
        loss = cross_entropy(
            output.logits.view(-1, num_classes), labels.view(-1), reduction="none"
        )

        # Unflatten batch dimension and convert to numpy array
        loss = loss.view(len(input_ids), -1).cpu().numpy()
        hidden_states = output.hidden_states[-1].cpu().numpy()

    return {"losses": loss, "preds": preds, "hidden_states": hidden_states}
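
`predict` below applies this via `Dataset.map`, but it can also be called on a slice of the encoded split directly, since slicing yields the dict-of-lists format the function expects (batch size illustrative):

>>> out = forward_pass_with_label(split_encoded[:8], model, collator, tags.num_classes)
>>> sorted(out)
['hidden_states', 'losses', 'preds']
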
def predict(split_encoded: Dataset, model, tokenizer, collator, tags) -> pd.DataFrame:
    """Generates predictions for a given dataset split and returns the results as a dataframe.

    Args:
        split_encoded (Dataset): The dataset to process.
        model: The model to process the dataset with.
        tokenizer: The tokenizer to process the dataset with.
        collator: The data collator to use.
        tags: The tags used in the dataset.

    Returns:
        pd.DataFrame: A dataframe containing token-level predictions.
    """

    split_encoded = split_encoded.map(
        partial(
            forward_pass_with_label,
            model=model,
            collator=collator,
            num_classes=tags.num_classes,
        ),
        batched=True,
        batch_size=8,
    )
    df: pd.DataFrame = split_encoded.to_pandas()  # type: ignore

    df["tokens"] = df["input_ids"].apply(
        lambda x: tokenizer.convert_ids_to_tokens(x)  # type: ignore
    )
    df["labels"] = df["labels"].apply(
        lambda x: ["IGN" if i == -100 else tags.int2str(int(i)) for i in x]
    )
    df["preds"] = df["preds"].apply(lambda x: [model.config.id2label[i] for i in x])
    df["preds"] = df.apply(lambda x: x["preds"][: len(x["input_ids"])], axis=1)
    df["losses"] = df.apply(lambda x: x["losses"][: len(x["input_ids"])], axis=1)
    df["hidden_states"] = df.apply(lambda x: x["hidden_states"][: len(x["input_ids"])], axis=1)
    df["total_loss"] = df["losses"].apply(sum)

    return df
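
Putting the pieces together, an end-to-end sketch (the dataset name, `model`, and `tokenizer` are assumptions for illustration):

>>> split = get_data("conll2003", "conll2003", "validation", 100)
>>> tags = split.features["ner_tags"].feature
>>> split_encoded, word_ids, ids = encode_dataset(split, tokenizer)
>>> collator = get_collator(tokenizer)
>>> df = predict(split_encoded, model, tokenizer, collator, tags)
>>> df[["tokens", "labels", "preds", "total_loss"]].head()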