Experimenting with AI Large Language Models

We took a few AI LLMs for testing. This use case takes paragraphs of text from a text file and answers questions about it.

-Raising Awesome.

It took a lot of study on the Hugging Face site, but here is a cheat sheet for making a Q&A AI script:

1.  Get Python going on your Windows or Linux box. At the time of this writing, Python 3.10 had to be used for the AI packages to work.

2. Open a terminal and create a directory for development.
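
For example, from a terminal (llm-dev is just a placeholder name, and this assumes python points at your 3.10 install):

python --version
mkdir llm-dev
cd llm-dev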

3. Get Hugging Face going:

python -m pip install huggingface_hub
huggingface-cli login
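
The scripts below also import the transformers and torch packages, so install those as well:

python -m pip install transformers torch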

4. Download a model by going to the Hugging Face site, visiting a model's page, and clicking the little copy button by its name.  Then do this:

huggingface-cli download packagename --local-dir .
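
For example, to grab the DistilBERT SQuAD model used in the script below:

huggingface-cli download distilbert/distilbert-base-cased-distilled-squad --local-dir .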

5. Make a content.txt file.  It should be a single line of text, but that one line can hold multiple paragraphs' worth of content, such as a story or book passage.
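
For example, content.txt could be one long line like this (a made-up sample borrowing from the story used later in this article):

Connor wanted to build an R2D2 when he was 5. That project turned into twelve years of Maker work, learning 3D printing, soldering, welding, and Autodesk Fusion 360. He is now in college majoring in Mechanical Engineering.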

6. Create your Python script and execute it:

import transformers
import torch
from transformers import pipeline

# See https://huggingface.co/tasks/question-answering
# Function to read the context passage from a file
def read_context_from_file(file_path):
    with open(file_path, 'r') as file:
        return file.read()
    
def get_response(question):
    result = qa_model(question=question, context=context) # {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31}
    return (result['answer'])

bert_pipeline = transformers.pipeline(
    "question-answering",
    model='distilbert-base-cased-distilled-squad' # this will result in it downloading to your user .cache directory
    # Preferably, the local model path can be used instead:
    # model=r'C:\Users\Sean-\.cache\huggingface\hub\models--distilbert--distilbert-base-cased-distilled-squad\snapshots\564e9b582944a57a3e586bbb98fd6f0a4118db7f'
)

print("setting model")

qa_model = bert_pipeline
context = read_context_from_file('content.txt') # a one-line text file with the paragraphs of content you want to ask questions about.

if __name__ == "__main__":
    print("\n\nCLOVER is online! Type 'exit' to quit.")
    
    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Goodbye!\n\n")
            break

        response = get_response(user_input)
        print(f"\n\nCLOVER: {response}")
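
The pipeline actually returns a dictionary with the answer plus a confidence score and character offsets (see the comment in get_response above). If you want to show the confidence too, a small variation of get_response() would be:

def get_response(question):
    result = qa_model(question=question, context=context)
    # result looks like {'answer': ..., 'score': ..., 'start': ..., 'end': ...}
    return f"{result['answer']} (confidence: {result['score']:.2f})"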

 

Another example:

This script takes a file named documents.json, fine-tunes a BERT model on its content, and then uses T5 to phrase the answers. The JSON file is shown after the script.
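
In addition to transformers and torch, it imports the datasets, pandas, and scikit-learn packages:

python -m pip install datasets pandas scikit-learn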

import json
import torch
import torch.nn.functional as F
from transformers import AutoModelForMaskedLM, AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Set number of threads for parallel processing
num_cores = 12
torch.set_num_threads(num_cores)
os.environ["OMP_NUM_THREADS"] = str(num_cores)
os.environ["MKL_NUM_THREADS"] = str(num_cores)

# Load JSON data
with open('documents.json', 'r') as file:
    data = json.load(file)

# Convert JSON data to a DataFrame
df = pd.DataFrame(data)

# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into training and evaluation sets
train_df, eval_df = train_test_split(dataset.to_pandas(), test_size=0.2)

# Convert back to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Load pre-trained model and tokenizer for MaskedLM
masked_lm_model_name = "deepset/bert-large-uncased-whole-word-masking-squad2"
masked_lm_model = AutoModelForMaskedLM.from_pretrained(masked_lm_model_name)
masked_lm_tokenizer = AutoTokenizer.from_pretrained(masked_lm_model_name)

# Preprocess data
def preprocess_function(examples):
    inputs = masked_lm_tokenizer(examples['content'], padding='max_length', truncation=True, max_length=512)
    # The masked language model needs labels, so copy the input IDs.
    # Note: with no DataCollatorForLanguageModeling passed to the Trainer, no tokens are
    # actually masked, so this is effectively continued training on the unmasked text.
    inputs['labels'] = inputs['input_ids']
    return inputs

# Apply the preprocessing function to your datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir='./FineTunedDocs',  # Set output directory to FineTunedDocs
    num_train_epochs=10,  # Training epochs
    per_device_train_batch_size=8,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,  # Indicates that lower is better for the eval_loss metric
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10,
)

# Define the trainer for MaskedLM
trainer = Trainer(
    model=masked_lm_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Patience can be adjusted as needed
)

# Fine-tune the MaskedLM model
trainer.train()

# Save the fine-tuned MaskedLM model
masked_lm_model.save_pretrained('FineTunedDocs')
masked_lm_tokenizer.save_pretrained('FineTunedDocs')

# Load fine-tuned models and tokenizers
# (FineTunedDocs was saved from the masked-LM trainer above, so the question-answering
#  head loaded here is freshly initialized rather than the original SQuAD2 head)
bert_model_name = "FineTunedDocs"
bert_model = AutoModelForQuestionAnswering.from_pretrained(bert_model_name)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

t5_model_name = "t5-base"
t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_model_name)
t5_tokenizer = AutoTokenizer.from_pretrained(t5_model_name)

# Ensure both models are in evaluation mode
bert_model.eval()
t5_model.eval()

# Load JSON data for QA
with open('documents.json', 'r') as file:
    documents = json.load(file)

# Combine the content of all documents into a single context
context = " ".join([doc['content'] for doc in documents])

def extract_relevant_text(question, context):
    # Tokenize the input; truncation keeps it within the model's limit
    # (note that context[:512] slices characters, not tokens)
    inputs = bert_tokenizer(question, context[:512], return_tensors='pt', padding='max_length', truncation=True)
    # Get model output
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Get the answer span (start and end positions)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    start_probs = F.softmax(start_logits, dim=-1)
    end_probs = F.softmax(end_logits, dim=-1)

    answer_start = torch.argmax(start_logits)
    answer_end = torch.argmax(end_logits)

    # Rough confidence: product of the start and end token probabilities
    answer_prob = (start_probs[0, answer_start] * end_probs[0, answer_end]).item()

    # Decode the answer span (+1 so the slice includes the end token)
    extracted_text = bert_tokenizer.convert_tokens_to_string(bert_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end + 1]))
    
    # Handle cases where extracted text is empty or invalid
    if not extracted_text.strip():
        extracted_text = "Sorry, I couldn't find a relevant answer."

    return extracted_text, answer_prob


def generate_human_like_response(question, extracted_text):
    # Concatenate question and extracted text
    input_text = f"question: {question} extracted_text: {extracted_text}"
    # Tokenize input
    inputs = t5_tokenizer(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
    # Generate response
    with torch.no_grad():
        outputs = t5_model.generate(inputs['input_ids'], max_length=150)
    # Decode the response
    response = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Handle cases where the generated response is not coherent
    if not response.strip():
        response = "Sorry, I couldn't generate a coherent response."

    return response

def main():
    print("Welcome to the QA responder. Type 'exit' to quit.")
    while True:
        user_input = input("Enter your question: ")
        if user_input.lower() == "exit":
            print("Goodbye!")
            break
        # Extract relevant text with BERT
        extracted_text, probability = extract_relevant_text(user_input, context)
        print(extracted_text)
        # Generate a human-like response with T5
        if probability < 0.000005:  # You can set a threshold for confidence
            response = "I don't know."
        else:
            response = generate_human_like_response(user_input, extracted_text)
        print(f"Answer: {response}")
        print(f"Confidence: {probability:.2f}")

if __name__ == "__main__":
    main()

 

[
    {
        "content": "When Connor was 5, he thought it would be cool for us to make an R2D2."
    },
    {
        "content": "Little did we know it would change the trajectory of both our lives for the next 12 years."
    },
    {
        "content": "We went on to a journey to learn all Maker skills."
    },
    {
        "content": "Although, as his dad, I brought to the table years of programming and practical experience working on my cars and homes, I learned along with him on electronics and metal working."
    },
    {
        "content": "Our journey continued through his teen years. He has now launched to college majoring in Mechanical Engineering."
    },
    {
        "content": "Along the way, we learned 3D printing, how to solder, how to weld, how to design with Autodesk Fusion 360, and on and on."
    },
    {
        "content": "The projects covered home maintenance and construction, mechatronics, IoT, and automative repair."
    },
    {
        "content": "We also matched our projects to life principles to help illuminate the best person he could become."
    },
    {
        "content": "We consider our time on these projects very well spent. It gave him insights and a life plan for the world ahead."
    },
    {
        "content": "The result is a great repository for new parents to take on such a journey with their children."
    },
    {
        "content": "Or, at the least, it's a great site to find great Maker projects and learn to become a jack of all trades (which is way better than just a master of one)."
    }
]

 

 

LLAMA

This code uses the Llama 3.1 8B Instruct model.  You download it locally first.
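
For example, it can be fetched the same way as in step 4 above (the Meta Llama models are gated, so you need to accept the license on the model's Hugging Face page and be logged in via huggingface-cli login first):

huggingface-cli download meta-llama/Llama-3.1-8B-Instruct --local-dir D:\AI\llama8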

import transformers
import torch
import os
import time

# model_id = "meta-llama/Llama-3.1-8B-Instruct"
model_id = r"D:\AI\llama8"
# Set the number of threads to the number of available CPU cores of your computer
# took 29 minutes on the HTPC

num_cores = 12  # IMPORTANT!
torch.set_num_threads(num_cores)
os.environ["OMP_NUM_THREADS"] = str(num_cores)
os.environ["MKL_NUM_THREADS"] = str(num_cores)

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

def get_response(question):
    start_time = time.time()
    print(start_time)
    messages = [
        
        {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
        {"role": "user", "content": question}
    ]
    print("checking")
    outputs = pipeline(
        messages,
        max_new_tokens=256,
    )

    end_time = time.time()
    # Calculate the total time taken
    total_time = end_time - start_time
    minutes, seconds = divmod(total_time, 60)

    print(f"Total time taken: {int(minutes)} minutes and {seconds:.2f} seconds")
    return (outputs[0]["generated_text"][-1])

if __name__ == "__main__":
    print("\n\nCLOVER is online! Type 'exit' to quit.")
    
    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Goodbye!\n\n")
            break

        response = get_response(user_input)
        print(f"\n\nCLOVER: {response}")
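
Note: get_response() returns the assistant's last chat message, which is a dictionary with 'role' and 'content' keys. If you only want the reply text, print response['content'] instead:

        response = get_response(user_input)
        print(f"\n\nCLOVER: {response['content']}")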