HuggingFace Transformers
Introduction¶
- Main NLP tasks course: https://huggingface.co/course/chapter7/1?fw=pt
Working with pipelines¶
The pipeline() function connects a model with its necessary preprocessing and postprocessing steps, allowing us to directly input any text and get an intelligible answer:
from transformers import pipeline
classifier = pipeline("sentiment-analysis")
classifier("I've been waiting for a HuggingFace course my whole life.")
# [{'label': 'POSITIVE', 'score': 0.9598047137260437}]
It is also possible to pass a batch of texts:
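For example, reusing the same sentiment-analysis pipeline with a list of inputs returns one prediction per sentence (the scores below are only indicative):
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
classifier([
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
])
# expected: one dict per input, roughly
# [{'label': 'POSITIVE', 'score': ~0.96}, {'label': 'NEGATIVE', 'score': ~0.99}]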
The zero-shot-classification pipeline lets you select the labels for the classification yourself.
Using Transformers pipelines we can perform many different tasks:
- text classification
- zero-shot-classification
- text generation
- text completion (mask filling)
- token classification
- question answering
- summarization
- translation
Zero-shot classification¶
We’ll start by tackling a more challenging task where we need to classify texts that haven’t been labelled. This is a common scenario in real-world projects because annotating text is usually time-consuming and requires domain expertise. For this use case, the zero-shot-classification pipeline is very powerful: it allows you to specify which labels to use for the classification, so you don’t have to rely on the labels of the pretrained model. You’ve already seen how the model can classify a sentence as positive or negative using those two labels — but it can also classify the text using any other set of labels you like.
from transformers import pipeline
classifier = pipeline("zero-shot-classification")
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business"],
)
# {'sequence': 'This is a course about the Transformers library',
# 'labels': ['education', 'business', 'politics'],
# 'scores': [0.8445963859558105, 0.111976258456707, 0.043427448719739914]}
Text-generation¶
from transformers import pipeline
generator = pipeline("text-generation", model="distilgpt2")
generator(
    "In this course, we will teach you how to",
    max_length=30,
    num_return_sequences=2,
)
# [{'generated_text': 'In this course, we will teach you how to manipulate the world and '
# 'move your mental and physical capabilities to your advantage.'},
# {'generated_text': 'In this course, we will teach you how to become an expert and '
# 'practice realtime, and with a hands on experience on both real '
# 'time and real'}]
Mask-filling¶
The next pipeline you’ll try is fill-mask. The idea of this task is to fill in the blanks in a given text:
from transformers import pipeline
unmasker = pipeline("fill-mask")
unmasker("This course will teach you all about <mask> models.", top_k=2)
Named entity recognition¶
Named entity recognition (NER) is a task where the model has to find which parts of the input text correspond to entities such as persons, locations, or organizations. Let’s look at an example:
from transformers import pipeline
ner = pipeline("ner", grouped_entities=True)
ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")
Question answering¶
The question-answering pipeline answers questions using information from a given context:
from transformers import pipeline
question_answerer = pipeline("question-answering")
question_answerer(
    question="Where do I work?",
    context="My name is Sylvain and I work at Hugging Face in Brooklyn",
)
# {'score': 0.6385916471481323, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}
Note that this pipeline works by extracting information from the provided context; it does not generate the answer.
Language Models¶
All the Transformer models mentioned above (GPT, BERT, BART, T5, etc.) have been trained as language models. This means they have been trained on large amounts of raw text in a self-supervised fashion. Self-supervised learning is a type of training in which the objective is automatically computed from the inputs of the model. That means that humans are not needed to label the data!
This type of model develops a statistical understanding of the language it has been trained on, but it’s not very useful for specific practical tasks. Because of this, the general pretrained model then goes through a process called transfer learning.
During this process, the model is fine-tuned in a supervised way — that is, using human-annotated labels — on a given task.
An example of a task is predicting the next word in a sentence having read the n previous words. This is called causal language modeling because the output depends on the past and present inputs, but not the future ones. Another example is masked language modeling, in which the model predicts a masked word in the sentence.
Transformers are big models. Apart from a few outliers (like DistilBERT), the general strategy to achieve better performance is to increase the models’ sizes as well as the amount of data they are pretrained on.
Transfer Learning¶
The idea of transfer learning is to initialize the weights from an already trained model and fine-tune the last layers for our purposes. Pretrained models are usually trained on very large amounts of data, which is why they tend to work better than models trained from scratch.
Usually, transfer learning is applied by dropping the head of the pretrained model while keeping its body. Fine-tuning, on the other hand, is the training done after a model has been pretrained. To perform fine-tuning, you first acquire a pretrained language model, then perform additional training with a dataset specific to your task.
Transformers¶
Architectures introduction¶
The model is primarily composed of two blocks:
- Encoder: The encoder receives an input and builds a representation of it (its features). This means that the model is optimized to acquire understanding from the input.
- Decoder: The decoder uses the encoder’s representation (features) along with other inputs to generate a target sequence. This means that the model is optimized for generating outputs.
Each of these parts can be used independently, depending on the task:
- Encoder-only models: Good for tasks that require understanding of the input, such as sentence classification and named entity recognition.
- Decoder-only models: Good for generative tasks such as text generation.
- Encoder-decoder models (or sequence-to-sequence models): Good for generative tasks that require an input, such as translation or summarization.
A key feature of Transformer models is that they are built with special layers called attention layers. For now, all you need to know is that this layer tells the model to pay specific attention to certain words in the sentence you passed it (and more or less ignore the others) when building the representation of each word.
The same concept applies to any task associated with natural language: a word by itself has a meaning, but that meaning is deeply affected by the context, which can be any other word (or words) before or after the word being studied.
Architectures vs. checkpoints¶
As we dive into Transformer models in this course, you’ll see mentions of architectures and checkpoints as well as models. These terms all have slightly different meanings:
- Architecture: This is the skeleton of the model, i.e. the definition of each layer and each operation that happens within the model.
- Checkpoints: These are the weights that will be loaded in a given architecture. For example, BERT is an architecture, while bert-base-cased is a checkpoint (a set of trained weights) for that architecture.
Encoder¶
The encoder outputs a numerical representation for each word used as input. This representation is called a feature vector: it contains one vector per input word, and the dimension of each vector is defined by the architecture of the model itself.
Each word in the initial sequence affects every other word's representation, so these are contextual vectors. This is achieved thanks to the self-attention mechanism: the representation of a word is informed by the other words in the phrase.
Encoders can be used as standalone models and are particularly good when we have to extract meaningful information, e.g. for sequence classification, question answering, and masked language modelling.
For masked language modelling, encoders, with their bi-directional context, are very good in terms of accuracy. They are also good at text classification, such as sentiment analysis.
Encoder models use only the encoder of a Transformer model. At each stage, the attention layers can access all the words in the initial sentence. These models are often characterized as having “bi-directional” attention, and are often called auto-encoding models.
The pretraining of these models usually revolves around somehow corrupting a given sentence (for instance, by masking random words in it) and tasking the model with finding or reconstructing the initial sentence.
Encoder models are best suited for tasks requiring an understanding of the full sentence, such as sentence classification, named entity recognition (and more generally word classification), and extractive question answering. (ALBERT, BERT, DistilBERT, ELECTRA, RoBERTa)
Decoder¶
The decoder uses the encoder’s representation (features) along with other inputs to generate a target sequence. This means that the model is optimized for generating outputs. Decoder models use only the decoder of a Transformer model. At each stage, for a given word the attention layers can only access the words positioned before it in the sentence. These models are often called auto-regressive models.
The pretraining of decoder models usually revolves around predicting the next word in the sentence. These models are best suited for tasks involving text generation.
Encoder-Decoder¶
The encoder generates a numerical representation of the text; in this case we pass that representation directly to the decoder, together with a start-of-sequence token.
The decoder decodes the sequence and outputs a word. Then, in an auto-regressive manner, that word combined with the encoder's representation can be used to generate a second word, and so on.
It is also possible to build an encoder-decoder by assembling a custom encoder model and a decoder model.
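Translation and summarization are typical encoder-decoder tasks. As a quick sketch, using the Helsinki-NLP/opus-mt-fr-en checkpoint from the course:
from transformers import pipeline

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
translator("Ce cours est produit par Hugging Face.")
# expected: [{'translation_text': 'This course is produced by Hugging Face.'}]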
HuggingFace Models¶
Looking at the sentiment-analysis pipeline, there are three stages: the tokenizer, whose output is given to the model, and finally a post-processing step that turns the logits (log-odds) into probabilities.
Tokenizer API¶
Like other neural networks, Transformer models can’t process raw text directly, so the first step of our pipeline is to convert the text inputs into numbers that the model can make sense of. To do this we use a tokenizer, which will be responsible for:
Splitting the input into words, subwords, or symbols (like punctuation) that are called tokens
Mapping each token to an integer
Adding additional inputs that may be useful to the model
All this preprocessing needs to be done in exactly the same way as when the model was pretrained, so we first need to download that information from the Model Hub. To do this, we use the AutoTokenizer class and its from_pretrained() method. Using the checkpoint name of our model, it will automatically fetch the data associated with the model’s tokenizer and cache it.
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
Once we have the tokenizer, we can directly pass our sentences to it and we’ll get back a dictionary that’s ready to feed to our model! The only thing left to do is to convert the list of input IDs to tensors (Transformer models only accept tensors as input).
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt") # return_tensors: "pt" for PyTorch, "tf" for TensorFlow
print(inputs)
"""
{
'input_ids': tensor([
[ 101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102],
[ 101, 1045, 5223, 2023, 2061, 2172, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0]
]),
'attention_mask': tensor([
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
])
}
"""
Going through the model¶
We can download our pretrained model the same way we did with our tokenizer. 🤗 Transformers provides an AutoModel class which also has a from_pretrained() method:
from transformers import AutoModel
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)
The vector output by the Transformer module is usually large. It generally has three dimensions:
Batch size: The number of sequences processed at a time (2 in our example).
Sequence length: The length of the numerical representation of the sequence (16 in our example).
Hidden size: The vector dimension of each model input. It is said to be “high dimensional” because of the last value. The hidden size can be very large (768 is common for smaller models, and in larger models this can reach 3072 or more).
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
#torch.Size([2, 16, 768])
AutoModelFor*¶
There are many different architectures available in 🤗 Transformers, with each one designed around tackling a specific task. Here is a non-exhaustive list:
- *Model (retrieve the hidden states)
- *ForCausalLM
- *ForMaskedLM
- *ForMultipleChoice
- *ForQuestionAnswering
- *ForSequenceClassification
- *ForTokenClassification
For our example, we will need a model with a sequence classification head (to be able to classify the sentences as positive or negative). So, we won’t actually use the AutoModel class, but AutoModelForSequenceClassification:
from transformers import AutoModelForSequenceClassification
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
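The logits returned by this model are raw, unnormalized scores. To reproduce the post-processing step of the pipeline, we can apply a softmax and read the label names from the model config (a short sketch following the course):
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)
print(model.config.id2label)
# expected for this checkpoint: {0: 'NEGATIVE', 1: 'POSITIVE'}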
Models¶
The AutoModel class and all of its relatives are actually simple wrappers over the wide variety of models available in the library. It’s a clever wrapper as it can automatically guess the appropriate model architecture for your checkpoint, and then instantiates a model with this architecture.
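As a quick check (a small sketch, not from the course; the variable name is just illustrative), loading a BERT checkpoint through AutoModel gives back an instance of the matching concrete class:
from transformers import AutoModel

bert_body = AutoModel.from_pretrained("bert-base-cased")
print(type(bert_body).__name__)
# expected: BertModel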
Config and fresh models¶
We also have to take care of the configuration of the model:
from transformers import BertConfig, BertModel
config = BertConfig() # Building the config
model = BertModel(config) # Building the model from the config
print(config)
# BertConfig {
# [...]
# "hidden_size": 768,
# "intermediate_size": 3072,
# "max_position_embeddings": 512,
# "num_attention_heads": 12,
# "num_hidden_layers": 12,
# [...]
# }
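The config can also be tweaked before building the model; as a small sketch (the chosen attribute values are just an example), a shallower BERT built from a customized config:
from transformers import BertConfig, BertModel

small_config = BertConfig(num_hidden_layers=6, hidden_size=384, num_attention_heads=6)
small_model = BertModel(small_config)  # still randomly initialized, like above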
Load pretrained models¶
With the operations above we are initializing the model with no pretrained weights! If instead we want the model to come pretrained, we just have to use from_pretrained() and give it the wanted checkpoint:
from transformers import BertModel
model = BertModel.from_pretrained("bert-base-cased")
# we could replace BertModel with the equivalent AutoModel class.
This model is now initialized with all the weights of the checkpoint. It can be used directly for inference on the tasks it was trained on, and it can also be fine-tuned on a new task. By training with pretrained weights rather than from scratch, we can quickly achieve good results. The weights have been downloaded and cached (so future calls to the from_pretrained() method won’t re-download them) in the cache folder, which defaults to ~/.cache/huggingface/transformers. You can customize your cache folder by setting the HF_HOME environment variable.
Saving the model¶
To save the model, just:
# Saving a model is as easy as loading one — we use the save_pretrained() method, which is analogous to the from_pretrained() method:
model.save_pretrained("directory_on_my_computer")
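save_pretrained() writes the configuration and the weights into that folder; a quick way to check (the exact weight file name depends on the Transformers version):
import os

print(os.listdir("directory_on_my_computer"))
# expected: 'config.json' plus the weights file
# ('pytorch_model.bin', or 'model.safetensors' in newer versions)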
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import transformers
import torch
Tokenizer API¶
Tokenizer Techniques¶
A tokenizer translates text into numbers. There are several possible approaches to do this, all with the goal of encoding the text while keeping as much meaning as possible. We have at least three options:
Word-based tokenizer¶
The first type of tokenizer that comes to mind is word-based. It’s generally very easy to set up and use with only a few rules, and it often yields decent results. we basically split each sentence according to some criterion (spaces, punctuation).
There are also variations of word tokenizers that have extra rules for punctuation. With this kind of tokenizer, we can end up with some pretty large “vocabularies,” where a vocabulary is defined by the total number of independent tokens that we have in our corpus. Each word gets assigned an ID, starting from 0 and going up to the size of the vocabulary. The model uses these IDs to identify each word.
If we want to completely cover a language with a word-based tokenizer, we’ll need an identifier for each word in the language, which will generate a huge number of tokens. For example, there are over 500,000 words in the English language, so to build a map from each word to an input ID we’d need to keep track of that many IDs. Finally, we need a custom token to represent words that are not in our vocabulary. This is known as the “unknown” token, often represented as ”[UNK]” or ”<unk>”. It’s generally a bad sign if you see that the tokenizer is producing a lot of these tokens, as it wasn’t able to retrieve a sensible representation of a word and you’re losing information along the way.
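In its simplest form, a word-based tokenizer is just whitespace splitting, as in the course example:
tokenized_text = "Jim Henson was a puppeteer".split()
print(tokenized_text)
# ['Jim', 'Henson', 'was', 'a', 'puppeteer']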
Character-based¶
Character-based tokenizers split the text into characters, rather than words. This has two primary benefits: The vocabulary is much smaller.There are much fewer out-of-vocabulary (unknown) tokens, since every word can be built from characters. This approach isn’t perfect either. Since the representation is now based on characters rather than words, one could argue that, intuitively, it’s less meaningful: each character doesn’t mean a lot on its own, whereas that is the case with words. However, this again differs according to the language; in Chinese, for example, each character carries more information than a character in a Latin language.
Subword-based¶
Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords. For instance, “annoyingly” might be considered a rare word and could be decomposed into “annoying” and “ly”. These are both likely to appear more frequently as standalone subwords, while at the same time the meaning of “annoyingly” is kept by the composite meaning of “annoying” and “ly”.
Here is an example showing how a subword tokenization algorithm would tokenize the sequence “Let’s do tokenization!“:
let's</w> do</w> token ##ization</w> !</w>
These subwords end up providing a lot of semantic meaning: for instance, in the example above “tokenization” was split into “token” and “ization”, two tokens that have a semantic meaning while being space-efficient (only two tokens are needed to represent a long word). This allows us to have relatively good coverage with small vocabularies, and close to no unknown tokens.
Subword-based tokenization is the most robust approach. The idea is to find a middle ground between word- and character-based algorithms. It relies on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords.
"Dog" will remain "dog", while "dogs" will become "dog" + "s".
Unsurprisingly, there are many more techniques out there. To name a few:
- Byte-level BPE, as used in GPT-2
- WordPiece, as used in BERT
- SentencePiece or Unigram, as used in several multilingual models
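As a rough comparison (the exact splits are an assumption and depend on each checkpoint's learned vocabulary), we can look at how a WordPiece tokenizer and a byte-level BPE tokenizer segment the same rare word:
from transformers import AutoTokenizer

bert_tok = AutoTokenizer.from_pretrained("bert-base-cased")  # WordPiece
gpt2_tok = AutoTokenizer.from_pretrained("gpt2")             # byte-level BPE

print(bert_tok.tokenize("annoyingly"))  # WordPiece marks word-internal pieces with '##'
print(gpt2_tok.tokenize("annoyingly"))  # BPE pieces come from merges learned on bytes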
Tokenizer Process¶
Loading and saving tokenizers is as simple as it is with models. Actually, it’s based on the same two methods: from_pretrained() and save_pretrained(). These methods will load or save the algorithm used by the tokenizer (a bit like the architecture of the model) as well as its vocabulary (a bit like the weights of the model).
Loading the BERT tokenizer trained with the same checkpoint as BERT is done the same way as loading the model, except we use the BertTokenizer class (here we use the equivalent AutoTokenizer class):
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer("Using a Transformer network is simple"))
# tokenizer.save_pretrained("directory_on_my_computer")
{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
Overall Pipeline¶
First the text is split into tokens, then the tokens are mapped to input IDs, and lastly the special tokens are added.
We can even perform these operations one at a time:
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)
['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
# this tokenization is the output from bert-base-cased! changing the checkpoint will also change the tokenization
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids) # these are the mappings to the vocabulary (be aware that they are tied to a particular vocab)
[7993, 170, 13809, 23763, 2443, 1110, 3014]
final_inputs = tokenizer.prepare_for_model(input_ids)
print(final_inputs["input_ids"])
[101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102]
where 101 and 102 are special tokens added to mark the beginning and the end of the sentence passed. We can notice that there are two more tokens than before.
If we decode the sentence we can see what they are:
print(tokenizer.decode(final_inputs["input_ids"]))
[CLS] Using a Transformer network is simple [SEP]
Padding sentences¶
Batching allows the model to work when you feed it multiple sentences. Using multiple sequences is just as simple as building a batch with a single sequence. There’s a second issue, though. When you’re trying to batch together two (or more) sentences, they might be of different lengths. If you’ve ever worked with tensors before, you know that they need to be of rectangular shape, so you won’t be able to convert the list of input IDs into a tensor directly. To work around this problem, we usually pad the inputs.
In order to work around this, we’ll use padding to make our tensors have a rectangular shape. Padding makes sure all our sentences have the same length by adding a special word called the padding token to the sentences with fewer values. For example, if you have 10 sentences with 10 words and 1 sentence with 20 words, padding will ensure all the sentences have 20 words. In our example, the resulting tensor looks like this:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
[200, 200, 200],
[200, 200, tokenizer.pad_token_id],
]
print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)
(Loading bert-base-cased into BertForSequenceClassification prints a warning: the pretraining head weights are not used, and a new classification head ('classifier.weight', 'classifier.bias') is randomly initialized, so the model should be trained on a downstream task before being used for real predictions.)
tensor([[-0.4969,  0.1369]], grad_fn=<AddmmBackward0>)
tensor([[-0.4789,  0.1862]], grad_fn=<AddmmBackward0>)
tensor([[-0.4969,  0.1369],
        [-0.6601,  0.0472]], grad_fn=<AddmmBackward0>)
There’s something wrong with the logits in our batched predictions: the second row should be the same as the logits for the second sentence, but we’ve got completely different values! This is because the key feature of Transformer models is attention layers that contextualize each token. These will take into account the padding tokens since they attend to all of the tokens of a sequence. To get the same result when passing individual sentences of different lengths through the model or when passing a batch with the same sentences and padding applied, we need to tell those attention layers to ignore the padding tokens. This is done by using an attention mask.
batched_ids = [
[200, 200, 200],
[200, 200, tokenizer.pad_token_id],
]
attention_mask = [
[1, 1, 1],
[1, 1, 0],
]
outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)
tensor([[-0.4969,  0.1369],
        [-0.4789,  0.1862]], grad_fn=<AddmmBackward0>)
# the padding can be done in several different ways:
# Will pad the sequences up to the maximum sequence length
sequences = ["Using a Transformer network is simple", "Using a Transformer network i", "Using a Transformer "]
model_inputs = tokenizer(sequences, padding="longest")
# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")
# Will pad the sequences up to the specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
# Will truncate the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)
# Will truncate the sequences that are longer than the specified max length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)
# wrapping up
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
(The same warning as above is printed again: bert-base-cased has no sequence classification head, so a new one is randomly initialized.)
Longer sequences¶
With Transformer models, there is a limit to the lengths of the sequences we can pass the models. Most models handle sequences of up to 512 or 1024 tokens, and will crash when asked to process longer sequences. There are two solutions to this problem:
- Use a model with a longer supported sequence length.
- Truncate your sequences.
Models have different supported sequence lengths, and some specialize in handling very long sequences. Longformer is one example, and another is LED. If you’re working on a task that requires very long sequences, we recommend you take a look at those models.
https://huggingface.co/transformers/model_doc/longformer.html
Tokenizer-library¶
We have to create a new tokenizer if:
- New language
- New characters
- New domain
- New style
Training a tokenizer is not the same as training a model! Model training uses stochastic gradient descent to make the loss a little bit smaller for each batch. It’s randomized by nature (meaning you have to set some seeds to get the same results when doing the same training twice). Training a tokenizer is a statistical process that tries to identify which subwords are the best to pick for a given corpus, and the exact rules used to pick them depend on the tokenization algorithm. It’s deterministic, meaning you always get the same results when training with the same algorithm on the same corpus.
In order to train a new tokenizer, we first have to collect a corpus of text, then choose a tokenization algorithm, and train it.
from datasets import load_dataset
raw_datasets = load_dataset("code_search_net", "python")
# Using a Python generator, we can avoid Python loading anything into memory until it’s actually necessary.
# To create such a generator, you just need to replace the brackets with parentheses:
def get_training_corpus():
    return (
        raw_datasets["train"][i : i + 1000]["whole_func_string"]
        for i in range(0, len(raw_datasets["train"]), 1000)
    )
# or better
def get_training_corpus():
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["whole_func_string"]
training_corpus = get_training_corpus()
Even though we are going to train a new tokenizer, it’s a good idea to do this to avoid starting entirely from scratch. This way, we won’t have to specify anything about the tokenization algorithm or the special tokens we want to use; our new tokenizer will be exactly the same as GPT-2, and the only thing that will change is the vocabulary, which will be determined by the training on our corpus.
from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)
# 52000 is the desired vocabulary size of the new tokenizer
# Note that train_new_from_iterator() only works if the tokenizer you are using is a “fast” tokenizer.
tokenizer.save_pretrained("code-search-net-tokenizer")
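The saved tokenizer can then be reloaded like any checkpoint and used on Python code; the snippet below is the example function used in the course:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("code-search-net-tokenizer")
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''
print(tokenizer.tokenize(example))
# expected: noticeably fewer tokens than the original gpt2 tokenizer would produce,
# since the new vocabulary was learned on Python code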
We’ll first take a look at the preprocessing that each tokenizer applies to text. Here’s a high-level overview of the steps in the tokenization pipeline:
The normalization step involves some general cleanup, such as removing needless whitespace, lowercasing, and/or removing accents. If you’re familiar with Unicode normalization (such as NFC or NFKC), this is also something the tokenizer may apply.
# tokenizer = AutoTokenizer.from_pretrained("")  # fast tokenizers are returned by default (use_fast=True)
# text_normalized = tokenizer.backend_tokenizer.normalizer.normalize_str(text) # to check how this operation is performed!
# pre_tokenization = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are you?")
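For example, with the fast tokenizer from bert-base-uncased (an assumption; any fast tokenizer exposes backend_tokenizer), the normalizer lowercases and strips accents, and the pre-tokenizer splits on whitespace and punctuation while keeping character offsets:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
# expected: 'hello how are u?'
print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are you?"))
# expected: a list of (word, (start_offset, end_offset)) pairs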
Datasets¶
How to load custom dataset¶
from datasets import load_dataset
# extra keyword arguments (e.g. sep) are passed through to pandas.read_csv()
# data_files could also be a url
# csv
local_csv = load_dataset("csv", data_files="path-to-file.csv", sep=",")
# json, single object
local_json = load_dataset("json", data_files="path-to-file.json", field="data")
# json, multiple files
data_files = {"train": f"{url}train.json", "test": f"{url}test.json"}
local_json = load_dataset("json", data_files=data_files, field="data")
# train_test_split
dataset = squad.train_test_split(test_size = 0.1)
# select and shuffle
indices = [0,10,20,40, 15]
squad.shuffle().select(indices)
# filter the dataframe
squad_filtered = squad.filter(lambda x:x["title"].startswith("L"))
# flatten
squad.flatten()
# we have that answers is nested into text and answer_start, with flatten we bring them out ['answers.text', 'answers.answer_start'].
# map
def lower_case(ex):
    return {"title": ex["title"].lower()}
squad_lower = squad.map(lower_case)  # note: with batched=True, ex["title"] would be a list of strings
## Using Dataset.map() with batched=True will be essential to unlock the speed of the “fast” tokenizers
# Renaming and filtering
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)
drug_dataset = drug_dataset.rename_column(original_column_name="Unnamed: 0", new_column_name="patient_id")
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}
drug_dataset = drug_dataset.map(lowercase_condition)
# Create new columns
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}
drug_dataset = drug_dataset.map(compute_review_length)
# sort values
drug_dataset["train"].sort("review_length")[:3]
# unescape HTML character codes in the reviews
import html
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})
Pandas integration¶
# convert into pandas dataframe!
dataset.set_format("pandas")
# easier way
dataset.to_pandas()
# back to original
dataset.reset_format()
# or
from datasets import Dataset
freq_dataset = Dataset.from_pandas(frequencies)
# create train-test-validation
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
# save and load datasets
# arrow format
from datasets import load_from_disk
drug_dataset_clean.save_to_disk("path") # save
drug_arrow_load = load_from_disk("path") # load
# csv
for split, dataset in raw_dataset.items():
    dataset.to_csv(f"myDataset-{split}.csv", index=None) # save
data_files = {"train": "myDataset-train.csv", "test": "myDataset-test.csv" }
load_dataset("csv", data_files=data_files)
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
import torch
# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
Huggingface datasets¶
We can use datasets from the Hugging Face Hub to fine-tune the model.
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets
Reusing dataset glue (/root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})
# As you can see, we get a DatasetDict object which contains the training set,
# the validation set, and the test set. Each of those contains several columns
# (sentence1, sentence2, label, and idx) and a variable number of rows, which are
# the number of elements in each set (so, there are 3,668 pairs of sentences in
# the training set, 408 in the validation set, and 1,725 in the test set).
raw_train_dataset = raw_datasets["train"]
print(raw_train_dataset[0])
raw_train_dataset.features
{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
{'idx': Value(dtype='int32', id=None), 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None), 'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None)}
Dataset Processing¶
To keep the data as a dataset, we will use the Dataset.map() method. This also allows us some extra flexibility, if we need more preprocessing done than just tokenization.
def tokenize_wrapper(obs):
    return tokenizer(
        obs["sentence1"], obs["sentence2"], padding='max_length', truncation=True
    )
tokenized_dataset = raw_datasets.map(tokenize_wrapper, batched=True)
print(tokenized_dataset.column_names)
tokenized_dataset = tokenized_dataset.remove_columns(["idx","sentence1", "sentence2"])
tokenized_dataset = tokenized_dataset.rename_column("label","labels")
tokenized_dataset = tokenized_dataset.with_format("torch")
tokenized_dataset
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-99d1d2a4040377ab.arrow Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-0c0caf4c25019c57.arrow Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-dc680b012cb80bae.arrow
{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})
Dynamic Padding¶
As we have seen in the "Batching inputs together" video, we need to pad sentences of different lengths to make batches.
The first approach is to pad all the sentences in the whole dataset to the maximum length found in the dataset. The main problem is that this approach creates a lot of padding tokens.
Another way is to pad the sentences at batch-creation time, to the length of the longest sentence in the batch; this is called dynamic padding. With this approach every batch has the smallest possible length, but the main downside is that some accelerators, like TPUs, do not work well with variable shapes.
The function that is responsible for putting together samples inside a batch is called a collate function. It’s an argument you can pass when you build a DataLoader, the default being a function that will just convert your samples to PyTorch tensors and concatenate them (recursively if your elements are lists, tuples, or dictionaries). This won’t be possible in our case since the inputs we have won’t all be of the same size. We have deliberately postponed the padding, to only apply it as necessary on each batch and avoid having over-long inputs with a lot of padding. To do this in practice, we have to define a collate function that will apply the correct amount of padding to the items of the dataset we want to batch together. Fortunately, the 🤗 Transformers library provides us with such a function via DataCollatorWithPadding. It takes a tokenizer when you instantiate it (to know which padding token to use, and whether the model expects padding to be on the left or on the right of the inputs) and will do everything you need:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
samples = tokenized_dataset["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]
[512, 512, 512, 512, 512, 512, 512, 512]
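Passing these samples through the collator stacks them into rectangular tensors. Note that because tokenize_wrapper used padding='max_length', every sequence here is already 512 tokens long; to actually benefit from dynamic padding, tokenize with truncation only and let the collator pad each batch to its longest member. A quick check (a sketch; the shapes are what I'd expect given the inputs above):
batch = data_collator(samples)
print({k: v.shape for k, v in batch.items()})
# expected here: every tensor has shape [8, 512]
# with truncation-only tokenization, the second dimension would instead match
# the longest sequence in this particular batch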