ARB Security Solutions, LLC.

SharePoint Online Text Classification With PyTorch

Connect to SharePoint Online, extract text data from a list, load the data into a custom PyTorch dataset, fine-tune a DistilBERT classifier on that dataset, and use the trained model to predict the label for a new test sentence.

# Import required libraries
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext

# Set up SharePoint Online connection
url = 'https://yourcompany.sharepoint.com/sites/sitename'
username = '[email protected]'
password = 'yourpassword'
auth_context = AuthenticationContext(url)
auth_context.acquire_token_for_user(username, password)
ctx = ClientContext(url, auth_context)
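The AuthenticationContext flow above is the library's older style (and hard-coding a password is for illustration only). Recent releases of Office365-REST-Python-Client also accept a UserCredential directly; a minimal sketch, assuming a current library version:

# Alternative connection style in newer library releases
from office365.runtime.auth.user_credential import UserCredential
ctx = ClientContext(url).with_credentials(UserCredential(username, password))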

# Get text data from a SharePoint Online list
list_name = 'TextList'
text_list = ctx.web.lists.get_by_title(list_name)
text_items = text_list.items
ctx.load(text_items)
ctx.execute_query()

# Extract text and labels from the list
# (list item fields are exposed through item.properties, not as attributes)
texts = []
labels = []
for item in text_items:
    texts.append(item.properties['Text'])
    labels.append(item.properties['Label'])
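CrossEntropyLoss expects integer class ids, but a SharePoint Text or Choice column returns strings. Assuming your Label column holds string values (an assumption about the list schema), map them to ids before building the dataset; a minimal sketch:

# Map string labels (e.g. 'positive'/'negative' -- hypothetical values) to integer ids
label2id = {name: idx for idx, name in enumerate(sorted(set(labels)))}
id2label = {idx: name for name, idx in label2id.items()}
labels = [label2id[label] for label in labels]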

# Define a custom dataset to load data into PyTorch
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        encoded_text = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Squeeze out the extra batch dimension added by return_tensors='pt'
        # so the DataLoader can stack individual examples into batches itself
        return (encoded_text['input_ids'].squeeze(0),
                encoded_text['attention_mask'].squeeze(0),
                torch.tensor(label))

# Initialize a DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create an instance of your dataset and dataloader
dataset = TextDataset(texts, labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
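A quick sanity check on one batch confirms the dataset returns tensors shaped the way the model expects (batch_size x max_length for the ids and mask):

# Inspect the first batch: expect shapes (4, 128), (4, 128), and (4,)
sample_ids, sample_mask, sample_labels = next(iter(dataloader))
print(sample_ids.shape, sample_mask.shape, sample_labels.shape)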

# Define your model architecture
# (num_labels=2 attaches a two-class classification head to the pretrained base)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2)

# Define your loss function and optimizer
# (a small learning rate such as 2e-5 is typical when fine-tuning transformers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
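For anything beyond a toy run, fine-tuning is often paired with a linear warmup/decay schedule. An optional sketch using the helper that ships with transformers (the step count assumes the 10-epoch loop below):

# Optional: linear warmup/decay over all training steps
from transformers import get_linear_schedule_with_warmup
num_training_steps = len(dataloader) * 10  # 10 epochs, matching the loop below
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
# If you adopt this, call scheduler.step() right after optimizer.step()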

# Train your model
model.train()
for epoch in range(10):
    running_loss = 0.0
    for i, batch in enumerate(dataloader):
        optimizer.zero_grad()
        # Use a distinct name so the labels list above is not shadowed
        input_ids, attention_mask, batch_labels = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, batch_labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Average over batches, since loss.item() is already a per-batch mean
    print('Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(dataloader)))
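Once training finishes, you will usually want to persist the fine-tuned weights instead of retraining on every run. A minimal sketch using the standard Hugging Face save/load helpers (the './distilbert-sharepoint' path is an arbitrary example):

# Save the fine-tuned model and tokenizer for later reuse
model.save_pretrained('./distilbert-sharepoint')
tokenizer.save_pretrained('./distilbert-sharepoint')
# Reload later with DistilBertForSequenceClassification.from_pretrained('./distilbert-sharepoint')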

# Use your trained model for prediction
model.eval()
test_text = 'This is a test sentence.'
encoded_test_text = tokenizer.encode_plus(
    test_text,
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)
with torch.no_grad():
    output = model(encoded_test_text['input_ids'],
                   attention_mask=encoded_test_text['attention_mask'])
predicted_label = torch.argmax(output.logits, dim=1).item()

# Print the predicted label
print('The predicted label is:', predicted_label)
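The raw class id is not very readable on its own. If you built the label2id/id2label mapping sketched earlier, you can translate the prediction back to the original SharePoint label text:

# Translate the class id back to the original label string
print('The predicted label is:', id2label[predicted_label])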
