import pandas as pd
import mysql.connector
import torch
from transformers import BertTokenizer, BertForSequenceClassification


def fetch_products_from_database():
    # Connect to your MySQL database
    connection = mysql.connector.connect(
        host="localhost",
        user="almexx_test",
        password="!TpfNGIBU28G(TbW",
        database="almexx_test",
        unix_socket="/mnt/mysql/fatcow/mysql.sock"
    )
    query = "SELECT product_name, product_description FROM products"
    # Note: pandas warns on raw DBAPI connections (it officially supports
    # SQLAlchemy connectables); the query still runs. See the SQLAlchemy
    # sketch at the end of this file.
    df = pd.read_sql(query, connection)
    # Close the database connection
    connection.close()
    return df


# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_product_pair(product1, product2, tokenizer, max_length=128):
    encoding = tokenizer(
        product1,
        product2,
        padding='max_length',   # Pad to the max length
        truncation=True,        # Truncate if longer than max_length
        max_length=max_length,
        return_tensors='pt'     # Return PyTorch tensors
    )
    # With return_tensors='pt' the tensors already carry a batch dimension
    # ([1, seq_length]), so they can be fed to the model as-is.
    return encoding


# Load the fine-tuned BERT model
model = BertForSequenceClassification.from_pretrained('./trained_model')
model.eval()  # Set the model to evaluation mode


def compute_similarity(product1, product2, model, tokenizer):
    # Tokenize the product pair
    encoding = tokenize_product_pair(product1, product2, tokenizer)
    # Pass the tokenized inputs to the model
    with torch.no_grad():  # Disable gradient computation for inference
        output = model(**encoding)
    # Get the logits (raw predictions) from the output
    logits = output.logits
    # Apply softmax to get probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    # Probability of the pair being a match (class 1 = match, class 0 = no match)
    match_probability = probabilities[0, 1].item()
    return match_probability


def find_best_match(new_product, database_df, model, tokenizer):
    best_match = None
    best_match_score = -1  # A low score for initialization
    for index, row in database_df.iterrows():
        product_name = row['product_name']
        product_description = row['product_description']
        product_pair = f"{product_name} {product_description}"
        # Compute the similarity score for this candidate
        match_probability = compute_similarity(new_product, product_pair, model, tokenizer)
        print(f"i: {index} product: {product_pair} score: {match_probability}")
        # If the similarity score is higher than the best score so far, update
        if match_probability > best_match_score:
            best_match_score = match_probability
            best_match = row  # Keep the whole row for further details
    return best_match, best_match_score


# Example: new product you want to find a match for
new_product = "Jumbo Drinkyoghurt aardbei 0,5L"

# Fetch all products from the database
database_df = fetch_products_from_database()

# Find the best match
best_match, best_match_score = find_best_match(new_product, database_df, model, tokenizer)

# Display the result
if best_match is not None:
    print(f"Best match for {new_product}: {best_match['product_name']} with score: {best_match_score:.2f}")
else:
    print("No match found.")
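

# --- Optional: batched scoring sketch ------------------------------------
# find_best_match above runs one forward pass per candidate row, which is
# slow for large catalogs. Below is a minimal sketch of batched inference
# using the same tokenizer and model; the helper name
# find_best_match_batched and the batch_size value are illustrative
# assumptions, not part of the original pipeline.
def find_best_match_batched(new_product, database_df, model, tokenizer, batch_size=32):
    candidates = (database_df['product_name'] + " " + database_df['product_description']).tolist()
    best_index, best_score = None, -1.0
    for start in range(0, len(candidates), batch_size):
        batch = candidates[start:start + batch_size]
        # Tokenize the whole batch at once; the new product is repeated as
        # the first segment of every pair.
        encoding = tokenizer(
            [new_product] * len(batch),
            batch,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        with torch.no_grad():
            logits = model(**encoding).logits
        # Probability of the "match" class (index 1) for each pair in the batch
        probs = torch.nn.functional.softmax(logits, dim=-1)[:, 1]
        top = torch.argmax(probs).item()
        if probs[top].item() > best_score:
            best_score = probs[top].item()
            best_index = start + top
    if best_index is None:
        return None, best_score
    return database_df.iloc[best_index], best_score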
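

# --- Optional: GPU inference sketch --------------------------------------
# If a CUDA device is available, moving the model and each encoding onto it
# speeds up scoring considerably. Device handling is an assumption added
# here, not part of the original code; left commented out so the script
# still runs unchanged on CPU-only machines.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# encoding = {k: v.to(device) for k, v in encoding.items()}  # inside compute_similarity, before model(**encoding)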
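

# --- Optional: SQLAlchemy engine for pd.read_sql -------------------------
# As noted in fetch_products_from_database, pandas only officially supports
# SQLAlchemy connectables (plus sqlite3) in read_sql and emits a UserWarning
# on raw DBAPI connections. A minimal sketch of the SQLAlchemy route,
# assuming the same credentials and socket path as the original connect()
# call (the password must be URL-encoded; <password> is a placeholder):
# from sqlalchemy import create_engine
# engine = create_engine(
#     "mysql+mysqlconnector://almexx_test:<password>@localhost/almexx_test"
#     "?unix_socket=/mnt/mysql/fatcow/mysql.sock"
# )
# df = pd.read_sql("SELECT product_name, product_description FROM products", engine)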