<p>It took a lot of study on the Hugging Face site, but here is a cheat sheet for making a Q&A AI script:</p>
<p style="padding-left: 20pt;">1. Get Python going on your Windows or Linux box. At the time of writing, Python 3.10 had to be used for the AI packages to work.</p>
<p style="padding-left: 20pt;">2. Open a terminal and create a directory for development.</p>
<p style="padding-left: 20pt;">3. Get Hugging Face going:</p>
<pre class="language-markup"><code>python -m pip install huggingface_hub
huggingface-cli login</code></pre>
<p style="padding-left: 20pt;">4. Download a model by going to Hugging Face, visiting a model's page, and clicking the little copy button by its name. Then do this:</p>
<p style="padding-left: 20pt;"><code>huggingface-cli download packagename --local-dir .</code></p>
<p style="padding-left: 20pt;">5. Make a content.txt file. Keep it to a single line (no line breaks), but that line can hold multiple paragraphs' worth of text, such as a story or book passage.</p>
<p style="padding-left: 20pt;">6. Create your Python script and execute it:</p>
<pre class="language-python"><code>import transformers
import torch
from transformers import pipeline

# See https://huggingface.co/tasks/question-answering

# Function to read the context from a file
def read_question_from_file(file_path):
    with open(file_path, 'r') as file:
        return file.read()

def get_response(question):
    result = qa_model(question=question, context=context)
    # {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31}
    return (result['answer'])

bert_pipeline = transformers.pipeline(
    "question-answering",
    model='distilbert-base-cased-distilled-squad'
    # this will result in it downloading to your user .cache directory
    # preferably, the local model path can be used instead:
    # model=r'C:\Users\Sean-\.cache\huggingface\hub\models--distilbert--distilbert-base-cased-distilled-squad\snapshots\564e9b582944a57a3e586bbb98fd6f0a4118db7f'
)

print("setting model")
qa_model = bert_pipeline

# a one-line text file with the paragraphs of content you want to ask questions about
context = read_question_from_file('content.txt')

if __name__ == "__main__":
    print("\n\nCLOVER is online! Type 'exit' to quit.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Goodbye!\n\n")
            break
        response = get_response(user_input)
        print(f"\n\nCLOVER: {response}")</code></pre>
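<p>If you downloaded a Q&A model in step 4 with <code>--local-dir .</code>, you can point the pipeline at that folder instead of letting it pull a fresh copy into your .cache directory. Here is a minimal sketch of that tweak (the folder path and the sample question/context are only placeholders, and it assumes the folder holds a SQuAD-style model such as distilbert-base-cased-distilled-squad):</p>
<pre class="language-python"><code>import transformers

# Load the question-answering pipeline from a local folder (the one you ran
# "huggingface-cli download ... --local-dir ." in) instead of the .cache directory.
qa_model = transformers.pipeline(
    "question-answering",
    model="./",  # placeholder path: the directory holding the downloaded model files
)

# Quick sanity check with a placeholder question and context
result = qa_model(
    question="What did Connor want to make?",
    context="When Connor was 5, he thought it would be cool for us to make an R2D2.",
)
print(result)  # a dict with 'answer', 'score', 'start', and 'end' keys</code></pre>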
<p>Another example:</p>
<p>This script will take a file named documents.json, fine-tune BERT on it, and then use T5 to phrase the answers. The JSON file is shown after the script.</p>
<pre class="language-python"><code>import json
import torch
import torch.nn.functional as F
from transformers import AutoModelForMaskedLM, AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Set number of threads for parallel processing
num_cores = 12
torch.set_num_threads(num_cores)
os.environ["OMP_NUM_THREADS"] = str(num_cores)
os.environ["MKL_NUM_THREADS"] = str(num_cores)

# Load JSON data
with open('documents.json', 'r') as file:
    data = json.load(file)

# Convert JSON data to a DataFrame
df = pd.DataFrame(data)

# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into training and evaluation sets
train_df, eval_df = train_test_split(dataset.to_pandas(), test_size=0.2)

# Convert back to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Load pre-trained model and tokenizer for MaskedLM
masked_lm_model_name = "deepset/bert-large-uncased-whole-word-masking-squad2"
masked_lm_model = AutoModelForMaskedLM.from_pretrained(masked_lm_model_name)
masked_lm_tokenizer = AutoTokenizer.from_pretrained(masked_lm_model_name)

# Preprocess data
def preprocess_function(examples):
    inputs = masked_lm_tokenizer(examples['content'], padding='max_length', truncation=True, max_length=512)
    inputs['labels'] = inputs['input_ids']  # Masked language model needs labels for masked tokens
    return inputs

# Apply the preprocessing function to your datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir='./FineTunedDocs',  # Set output directory to FineTunedDocs
    num_train_epochs=10,  # Training epochs
    per_device_train_batch_size=8,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,  # Indicates that lower is better for the eval_loss metric
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10,
)

# Define the trainer for MaskedLM
trainer = Trainer(
    model=masked_lm_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Patience can be adjusted as needed
)

# Fine-tune the MaskedLM model
trainer.train()

# Save the fine-tuned MaskedLM model
masked_lm_model.save_pretrained('FineTunedDocs')
masked_lm_tokenizer.save_pretrained('FineTunedDocs')

# Load fine-tuned models and tokenizers
bert_model_name = "FineTunedDocs"
bert_model = AutoModelForQuestionAnswering.from_pretrained(bert_model_name)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

t5_model_name = "t5-base"
t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_model_name)
t5_tokenizer = AutoTokenizer.from_pretrained(t5_model_name)

# Ensure both models are in evaluation mode
bert_model.eval()
t5_model.eval()

# Load JSON data for QA
with open('documents.json', 'r') as file:
    documents = json.load(file)

# Combine the content of all documents into a single context
context = " ".join([doc['content'] for doc in documents])

def extract_relevant_text(question, context):
    # Tokenize input and ensure the context length is within the model's limit
    inputs = bert_tokenizer(question, context[:512], return_tensors='pt',
                            padding='max_length', truncation=True)

    # Get model output
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Get the answer span (start and end positions)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    start_probs = F.softmax(start_logits, dim=-1)
    end_probs = F.softmax(end_logits, dim=-1)
    answer_start = torch.argmax(start_logits)
    answer_end = torch.argmax(end_logits) + 1
    # probability of the predicted span (answer_end is exclusive, so subtract 1)
    answer_prob = (start_probs[0, answer_start] * end_probs[0, answer_end - 1]).item()

    # Decode the answer span
    extracted_text = bert_tokenizer.convert_tokens_to_string(bert_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

    # Handle cases where extracted text is empty or invalid
    if not extracted_text.strip():
        extracted_text = "Sorry, I couldn't find a relevant answer."

    return extracted_text, answer_prob

def generate_human_like_response(question, extracted_text):
    # Concatenate question and extracted text
    input_text = f"question: {question} extracted_text: {extracted_text}"

    # Tokenize input
    inputs = t5_tokenizer(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

    # Generate response
    with torch.no_grad():
        outputs = t5_model.generate(inputs['input_ids'], max_length=150)

    # Decode the response
    response = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Handle cases where the generated response is not coherent
    if not response.strip():
        response = "Sorry, I couldn't generate a coherent response."

    return response

def main():
    print("Welcome to the QA responder. Type 'exit' to quit.")
    while True:
        user_input = input("Enter your question: ")
        if user_input.lower() == "exit":
            print("Goodbye!")
            break

        # Extract relevant text with BERT
        extracted_text, probability = extract_relevant_text(user_input, context)
        print(extracted_text)

        # Generate a human-like response with T5
        if probability < 0.000005:  # You can set a threshold for confidence
            response = "I don't know."
        else:
            response = generate_human_like_response(user_input, extracted_text)

        print(f"Answer: {response}")
        print(f"Confidence: {probability:.2f}")

if __name__ == "__main__":
    main()</code></pre>
<pre class="language-javascript"><code>[
  { "content": "When Connor was 5, he thought it would be cool for us to make an R2D2." },
  { "content": "Little did we know it would change the trajectory of both our lives for the next 12 years." },
  { "content": "We went on a journey to learn all Maker skills." },
  { "content": "Although, as his dad, I brought to the table years of programming and practical experience working on my cars and homes, I learned along with him on electronics and metal working." },
  { "content": "Our journey continued through his teen years. He has now launched to college majoring in Mechanical Engineering." },
  { "content": "Along the way, we learned 3D printing, how to solder, how to weld, how to design with Autodesk Fusion 360, and on and on." },
  { "content": "The projects covered home maintenance and construction, mechatronics, IoT, and automotive repair." },
  { "content": "We also matched our projects to life principles to help illuminate the best person he could become." },
  { "content": "We consider our time on these projects very well spent. It gave him insights and a life plan for the world ahead." },
  { "content": "The result is a great repository for new parents to take on such a journey with their children." },
  { "content": "Or, at the least, it's a great site to find great Maker projects and learn to become a jack of all trades (which is way better than just a master of one)." }
]</code></pre>
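<p>If you want to build a documents.json like this from your own notes, the format is just a list of objects with a "content" key. Here is a small helper sketch (not part of the original script; the input file name notes.txt is only an example) that turns a plain text file with one fact per line into that format:</p>
<pre class="language-python"><code>import json

# Read a plain text file with one fact or sentence per line (notes.txt is a placeholder name)
with open('notes.txt', 'r') as file:
    lines = [line.strip() for line in file if line.strip()]

# Wrap each line in the {"content": ...} shape the training script expects
documents = [{"content": line} for line in lines]

with open('documents.json', 'w') as file:
    json.dump(documents, file, indent=2)

print(f"Wrote {len(documents)} entries to documents.json")</code></pre>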
}, { "content": "Or, at the least, it's a great site to find great Maker projects and learn to become a jack of all trades (which is way better than just a master of one)." } ] </code></pre> <p> </p> <p> </p> <h1>LLAMA</h1> <p>This code uses the Llama 8B. You download it locally.</p> <pre class="language-python"><code>import transformers import torch import os import time # model_id = "meta-llama/Llama-3.1-8B-Instruct" model_id = r"D:\AI\llama8" # Set the number of threads to the number of available CPU cores of your computer # took 29 minutes on the HTPC num_cores = 12 # IMPORTANT! torch.set_num_threads(num_cores) os.environ["OMP_NUM_THREADS"] = str(num_cores) os.environ["MKL_NUM_THREADS"] = str(num_cores) pipeline = transformers.pipeline( "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto", ) def get_response(question): start_time = time.time() print(start_time) messages = [ {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"}, {"role": "user", "content": question} ] print("checking") outputs = pipeline( messages, max_new_tokens=256, ) end_time = time.time() # Calculate the total time taken total_time = end_time - start_time minutes, seconds = divmod(total_time, 60) print(f"Total time taken: {int(minutes)} minutes and {seconds:.2f} seconds") return (outputs[0]["generated_text"][-1]) if __name__ == "__main__": print("\n\nCLOVER is online! Type 'exit' to quit.") while True: user_input = input("You: ") if user_input.lower() == "exit": print("Goodbye!\n\n") break response = get_response(user_input) print(f"\n\nCLOVER: {response}") </code></pre>