import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Check if CUDA (GPU support) is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load CSV data into a pandas DataFrame
def load_data_from_csv(file_path):
    return pd.read_csv(file_path)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset class: tokenizes one product-name pair per sample
class ProductMatchDataset(torch.utils.data.Dataset):
    def __init__(self, product_pairs, labels, tokenizer, max_length=128):
        self.product_pairs = product_pairs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.product_pairs)

    def __getitem__(self, idx):
        product1, product2 = self.product_pairs[idx]
        # Tokenize the two names as a sentence pair (keep tensors on CPU;
        # the Trainer moves each batch to the right device itself)
        encoding = self.tokenizer(
            product1,
            product2,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # squeeze(0) drops the extra batch dimension the tokenizer adds
        encoding = {key: value.squeeze(0) for key, value in encoding.items()}
        encoding['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return encoding

# Prepare the dataset from a CSV with columns: product_name_1, product_name_2, label
def prepare_data_for_finetuning(file_path):
    df = load_data_from_csv(file_path)
    product_pairs = list(zip(df['product_name_1'], df['product_name_2']))
    labels = df['label'].tolist()
    return ProductMatchDataset(product_pairs, labels, tokenizer)

# Load dataset
file_path = 'Snoep.csv'
dataset = prepare_data_for_finetuning(file_path)

# Load pre-trained BERT model with a 2-class classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    fp16=torch.cuda.is_available()  # Mixed-precision training requires a GPU
)

# Initialize Trainer. It builds its own DataLoader and handles batching,
# shuffling, and moving batches to the GPU, so no manual DataLoader or
# device-transfer loop is needed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

# Train the model. Pass resume_from_checkpoint=True only if a checkpoint
# already exists in output_dir; on a fresh run it raises an error.
trainer.train()

# Save model and tokenizer together so they can be reloaded from one directory
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')
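
# --- Inference sketch (optional) ---
# A minimal example of using the saved model for prediction, assuming the
# training above completed and wrote ./trained_model. The mapping of class 1
# to "match" is an assumption; it depends on how labels were encoded in the
# training CSV. The example product pair below is hypothetical.

model = BertForSequenceClassification.from_pretrained('./trained_model').to(device)
tokenizer = BertTokenizer.from_pretrained('./trained_model')
model.eval()

def predict_match(product1, product2):
    # Tokenize a single pair and move it to the same device as the model
    encoding = tokenizer(
        product1,
        product2,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    ).to(device)
    with torch.no_grad():
        logits = model(**encoding).logits
    # Returns the predicted class index (assumed: 1 = match, 0 = no match)
    return logits.argmax(dim=-1).item()

print(predict_match("Haribo Goldbears 250g", "Haribo Gold-Bears Gummi Candy, 250 g"))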