In [None]:
# Use a leading exclamation mark ! to change the code cell to treating the input as a shell script
! pip install datasets transformers
! pip install vllm
! pip install tqdm

### Download Dataset
See the OR-Instruct-Data-3K dataset card at https://huggingface.co/datasets/CardinalOperations/OR-Instruct-Data-3K

In [None]:
from datasets import load_dataset
# Load the OR-Instruct-Data-3K dataset by its Hugging Face Hub identifier.
# The cells below read its 'train' split.
dataset = load_dataset("CardinalOperations/OR-Instruct-Data-3K")

### Meet your dataset
This dataset consists of 2 columns:
1. `prompt` (string): the system instructions, including the question to solve
2. `completion` (string): the response/answer to those instructions

In [None]:
# Display the loaded dataset object to inspect its overall structure
dataset

In [None]:
dataset['train'][0] # Access the first record of the 'train' split

### Reformatting a record that is difficult to read

In [None]:
def convert_to_template(data_dict):
    """
    Convert a dictionary with 'prompt' and 'completion' keys to the specified template format.

    The question is the text that follows the '# Question:' marker in the
    prompt. If the marker is absent, the whole prompt is used as the question
    instead of silently cutting off its first characters. A '# Response:'
    marker left at the very end of the prompt is removed so the header is not
    duplicated in the formatted output.

    Args:
        data_dict (dict): Dictionary containing 'prompt' and 'completion' keys

    Returns:
        str: Formatted string in the template format

    Raises:
        KeyError: If 'prompt' or 'completion' is missing from data_dict
    """
    question_marker = '# Question:'
    response_marker = '# Response:'
    template = r"""Below is an operations research question. Build a mathematical model and corresponding python code using `coptpy` that appropriately addresses the question.

# Question:
{Question}

# Response:
{Response}"""
    prompt_text = data_dict['prompt']

    # str.find returns -1 when the marker is missing; adding the marker length
    # to -1 would yield a bogus offset, so that case is handled explicitly.
    marker_pos = prompt_text.find(question_marker)
    if marker_pos == -1:
        question_text = prompt_text.strip()
    else:
        question_text = prompt_text[marker_pos + len(question_marker):].strip()

    # The template adds its own '# Response:' header, so drop one that the
    # prompt already ends with.
    if question_text.endswith(response_marker):
        question_text = question_text[:-len(response_marker)].rstrip()

    # Get the completion/response and fill in the template
    response_text = data_dict['completion']
    return template.format(Question=question_text, Response=response_text)

def save_string_to_file(content, filename, mode='w', encoding='utf-8'):
    """
    Write a string to a text file and report whether the write succeeded.

    Any exception raised while opening or writing the file is caught, reported
    on standard output, and turned into a False return value.

    Args:
        content (str): Text to write
        filename (str): Name/path of the target file
        mode (str): File mode - 'w' to overwrite, 'a' to append
        encoding (str): Text encoding of the file (default: 'utf-8')

    Returns:
        bool: True when the content was written, False when an error occurred
    """
    succeeded = False
    try:
        with open(filename, mode, encoding=encoding) as handle:
            handle.write(content)
        print(f"Successfully saved content to {filename}")
        succeeded = True
    except Exception as err:
        # Best-effort helper: report the problem instead of propagating it
        print(f"Error saving file: {err}")
    return succeeded

# Format the first training record with the template and keep a copy in result.txt
result = convert_to_template(dataset['train'][0])
save_string_to_file(result, "result.txt")

# Show the formatted record
print(result)

### Calling the COPT solver to solve this optimization problem

In [None]:
!pip install coptpy

In [None]:
import coptpy as cp
from coptpy import COPT

# Build and solve a small linear program for the geothermal scheduling example.

# Create a COPT environment
env = cp.Envr()

# Create a model
model = env.createModel("GeothermalPowerPlantScheduling")

# Define decision variables
# Each well's extraction quantity is bounded below by 0 and above by its own limit.
x_A = model.addVar(lb=0, ub=1000, name="x_A")  # Extraction quantity of well A
x_B = model.addVar(lb=0, ub=1500, name="x_B")  # Extraction quantity of well B
x_C = model.addVar(lb=0, ub=2000, name="x_C")  # Extraction quantity of well C

# Define the objective function
# The expression collapses to -6*x_A - 5*x_B - 4*x_C, which is maximized.
model.setObjective(x_A + x_B + x_C - 5*x_A - 4*x_B - 3*x_C - 2*(x_A + x_B + x_C), sense=COPT.MAXIMIZE)

# Add constraints
model.addConstr(x_A + x_B + x_C == 2800, name="MarketDemand")  # Electricity market demand constraint
# NOTE(review): with the total fixed at 2800 by MarketDemand, this <= 3000 bound is always satisfied.
model.addConstr(x_A + x_B + x_C <= 3000, name="EquipmentCapacity")  # Equipment operating capacity constraint
# NOTE(review): 0.4 * total <= total holds for every non-negative total, so this
# constraint does not restrict the model - confirm the intended reinjection formulation.
model.addConstr(0.4*(x_A + x_B + x_C) <= x_A + x_B + x_C, name="ReinjectionRatio")  # Reinjection ratio constraint

# Solve the model
model.solve()

# Output the results
# Variable values are only printed when the solver reports an optimal solution.
if model.status == COPT.OPTIMAL:
    print("Maximum total revenue: {:.2f}".format(model.objval))
    print("Scheduling plan:")
    print(f"Extraction quantity of well A: {x_A.x}")
    print(f"Extraction quantity of well B: {x_B.x}")
    print(f"Extraction quantity of well C: {x_C.x}")
else:
    print("No optimal solution found.")

In [None]:
# Display the `features` attribute of the training split, which describes its columns
dataset['train'].features

In [None]:
# Display the total number of rows (examples) in the training split
dataset['train'].num_rows

In [None]:
# Display the first completion in the training dataset.
# The row is indexed first so only that record is read, rather than
# materializing the whole 'completion' column and then taking one element.
dataset["train"][0]["completion"]

In [None]:
# Display the first prompt in the training dataset
# (row-first indexing reads a single record instead of the whole column).
dataset["train"][0]["prompt"]

### Modify your dataset

In [None]:

# Randomly shuffle all rows in the training dataset while setting a random seed for reproducible results
# (shuffling prevents a model from learning order-based patterns)
shuffled_dataset = dataset["train"].shuffle(seed=42)

# Create a subset of the training dataset by selecting the first 5 rows using index range [0-4]
selected_dataset = dataset["train"].select(range(5))

# Split the training dataset into train and validation sets
# 80% of data goes to train_dataset
# 20% of data goes to valid_dataset (test_size=0.2)
# seed=42 ensures reproducible splitting
# train_test_split returns a DatasetDict keyed by "train" and "test"; tuple-unpacking
# it would bind the key strings rather than the datasets, so index it by key instead.
split_datasets = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets["train"]
valid_dataset = split_datasets["test"]

# Split dataset into multiple smaller parts (shards) for distributed processing
# - num_shards=5: Divide dataset into 5 parts
# - index=0: Select the first shard
# NOTE(review): whether a shard holds strided rows (0, 5, 10, ...) or one contiguous
# block depends on the `contiguous` argument, whose default differs between
# `datasets` versions - confirm against the installed version.
sharded_dataset = dataset["train"].shard(num_shards=5, index=0)
print("\nSharded dataset size:", len(sharded_dataset))
print("Original dataset size:", len(dataset["train"]))

### Saving and Exporting Data

In [None]:
# Save the whole dataset to disk.
# NOTE(review): "./" writes the dataset files directly into the current working
# directory - consider a dedicated sub-directory to keep them separate.
dataset.save_to_disk("./")

In [None]:
# Export the training split to the current directory in three formats: CSV, JSON and Parquet
dataset["train"].to_csv("./dataset.csv")
dataset["train"].to_json("./dataset.json")
dataset["train"].to_parquet("./dataset.parquet")

### Load a pretrained model and chat with it

In [None]:
# Load the tokenizer and the causal language model for the ORLM checkpoint.
# The text-generation pipeline itself is built from them in the next cell.
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
model_id = "CardinalOperations/ORLM-LLaMA-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): this rebinds the global name `model`, which the COPT cell above
# uses for the optimization model - re-running that cell afterwards swaps it back.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

In [None]:

# Build a text-generation pipeline from the already-loaded model and tokenizer.
# NOTE(review): max_new_tokens=8000 is a large generation budget - confirm that the
# prompt plus 8000 new tokens fits within the model's context window.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=8000)

def my_convert_to_template(data_dict):
    """
    Convert generated text to the specified template format.

    The text is split on its '# Question:' and '# Response:' markers. Missing
    markers are tolerated: without '# Question:' the question is taken from
    the start of the text, and without '# Response:' the question runs to the
    end of the text and the response is empty.

    Args:
        data_dict (dict): Dictionary containing 'generated_text' key

    Returns:
        str: Formatted string in the template format

    Raises:
        KeyError: If 'generated_text' is missing from data_dict
    """
    question_marker = '# Question:'
    response_marker = '# Response:'
    template = r"""Below is an operations research question. Build a mathematical model and corresponding python code using `coptpy` that appropriately addresses the question.

# Question:
{Question}

# Response:
{Response}"""

    generated = data_dict['generated_text']

    # str.find returns -1 for a missing marker; using -1 in slice arithmetic
    # would silently produce garbage offsets, so each case is handled explicitly.
    question_pos = generated.find(question_marker)
    question_begin = 0 if question_pos == -1 else question_pos + len(question_marker)

    # Only look for the response marker after the start of the question
    response_pos = generated.find(response_marker, question_begin)
    if response_pos == -1:
        question_text = generated[question_begin:].strip()
        response_text = ''
    else:
        question_text = generated[question_begin:response_pos].strip()
        response_text = generated[response_pos + len(response_marker):].strip()

    # Format using the template
    return template.format(Question=question_text, Response=response_text)

# Take the first training prompt; indexing the row first reads a single record
# instead of materializing the whole 'prompt' column.
question = dataset["train"][0]["prompt"]
print(question)
# Run the model on the prompt; the first result's 'generated_text' is reformatted below
answer = pipe(question)
print(answer)
formatted_answer = my_convert_to_template(answer[0])
print(formatted_answer)
