This script connects to SharePoint Online, extracts customer feedback from a list named CustomerFeedback, and loads the feedback text and its sentiment labels into a custom PyTorch dataset. That dataset is used to fine-tune a DistilBERT-based sentiment analysis model, which is then used to predict sentiment labels for three sample feedback statements.
# Import required libraries
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# Set up SharePoint Online connection
url = 'https://yourcompany.sharepoint.com/sites/sitename'
username = '[email protected]'
password = 'yourpassword'
auth_context = AuthenticationContext(url)
auth_context.acquire_token_for_user(username, password)
ctx = ClientContext(url, auth_context)
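# Note: username/password auth fails for accounts with MFA enabled. As an
# alternative (assuming an Azure AD app registration; client_id and
# client_secret below are hypothetical placeholders), the same library
# supports app-only credentials:
# from office365.runtime.auth.client_credential import ClientCredential
# ctx = ClientContext(url).with_credentials(ClientCredential(client_id, client_secret))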
# Get data from SharePoint Online list
list_name = 'CustomerFeedback'
feedback_list = ctx.web.lists.get_by_title(list_name)
feedback_items = feedback_list.items
ctx.load(feedback_items)
ctx.execute_query()
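# Note: SharePoint Online returns list items in pages; for lists larger than
# the default page size, additional paging logic may be required.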
# Extract feedback and sentiment label from the list
feedback = []
sentiment_label = []
for item in feedback_items:
    # List item fields are exposed via the properties dict; 'Feedback' and
    # 'Sentiment' must match the columns' internal names
    feedback.append(item.properties['Feedback'])
    sentiment_label.append(item.properties['Sentiment'])
# Define a custom dataset to load data into PyTorch
class FeedbackDataset(Dataset):
    def __init__(self, feedback, sentiment_label, tokenizer):
        self.feedback = feedback
        self.sentiment_label = sentiment_label
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.feedback)

    def __getitem__(self, index):
        text = self.feedback[index]
        label = self.sentiment_label[index]
        # Pad to a fixed length so examples can be stacked into batches;
        # padding=True on a single text would not produce uniform lengths
        encoding = self.tokenizer(text, truncation=True, padding='max_length',
                                  max_length=128, return_tensors='pt')
        return {'input_ids': encoding['input_ids'][0],
                'attention_mask': encoding['attention_mask'][0],
                # Assumes sentiment labels are integer-encoded (e.g. 0/1/2)
                'labels': torch.tensor(label, dtype=torch.long)}
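# Sanity check (assuming the list is non-empty): dataset[0] should yield
# fixed-length tensors, e.g. dataset[0]['input_ids'].shape == torch.Size([128])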
# Define the tokenizer and model; infer num_labels from the data (defaults to 2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=len(set(sentiment_label)))
model.to(device)
# Create an instance of your dataset and dataloader
dataset = FeedbackDataset(feedback, sentiment_label, tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
# Define the optimizer; the model computes cross-entropy loss internally
# when labels are passed, so no separate criterion is needed
optimizer = optim.Adam(model.parameters(), lr=2e-5)
# Train the model
model.train()
for epoch in range(3):
    running_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Average over batches, since each loss is already a per-batch mean
    print('Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(dataloader)))
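# Optionally persist the fine-tuned model and tokenizer for later reuse;
# the output directory name here is an arbitrary placeholder
model.save_pretrained('feedback-sentiment-model')
tokenizer.save_pretrained('feedback-sentiment-model')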
# Use the trained model for prediction
model.eval()
test_data = ['I love this product', 'I hate this product', 'This product is okay']
test_encoding = tokenizer(test_data, truncation=True, padding=True, return_tensors='pt')
test_input_ids = test_encoding['input_ids'].to(device)
test_attention_mask = test_encoding['attention_mask'].to(device)
with torch.no_grad():
    test_output = model(test_input_ids, attention_mask=test_attention_mask)
test_predictions = torch.argmax(test_output.logits, dim=1).tolist()
# Print the predictions
print(test_predictions)
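# The predictions are integer class ids. Assuming the Sentiment column stores
# integer-encoded labels, a mapping like the (hypothetical) one below converts
# them back to readable names:
label_names = {0: 'negative', 1: 'positive', 2: 'neutral'}
print([label_names.get(p, str(p)) for p in test_predictions])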