导航菜单

自然语言处理/NLP实战案例
课程进度 80% · 第12/14章 · 标签 1/4
1

文本分类实战

使用深度学习模型进行新闻文本分类,实现自动新闻分类系统。

项目概述

  • 数据集:新闻文本数据集
  • 任务:多分类文本分类
  • 模型:BERT + 分类头
  • 评估指标:准确率、F1分数
2

实现代码

python
1
import torch
2
from transformers import BertTokenizer, BertForSequenceClassification
3
from torch.utils.data import Dataset, DataLoader
4
import pandas as pd
5
import numpy as np
6
 
7
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Each item is a dict with the keys ``'input_ids'``, ``'attention_mask'``
    and ``'labels'``, which is the keyword set ``train_model`` feeds to the
    model.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        """Store the raw samples and the tokenization settings.

        Args:
            texts: Indexable collection of texts; each entry is passed
                through ``str()`` before tokenization.
            labels: Indexable collection of labels, aligned with ``texts``.
                Each entry must be convertible to a ``torch.long`` tensor.
            tokenizer: Callable invoked as ``tokenizer(text, ...)`` that
                returns a mapping holding ``'input_ids'`` and
                ``'attention_mask'`` tensors.
            max_length: Length every sequence is padded/truncated to.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (driven by ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer is asked for batched ('pt') tensors; flatten them so
        # each item is 1-D and the DataLoader can add the batch dimension.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }
15
 
16
def train_model(model, train_loader, val_loader, device, epochs=3):
    """Train ``model`` with AdamW and print loss/accuracy after each epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        val_loader: Iterable of batches in the same format, used only for
            the accuracy measurement.
        device: Device the batch tensors are moved to. The model itself is
            not moved here; the caller is expected to have done that.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. The per-epoch summary is printed to stdout.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0.0
        num_batches = 0  # counted here so loaders without __len__ also work
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # ---- validation pass ----
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                _, predicted = torch.max(outputs.logits, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # An empty loader has nothing to average; report NaN rather than
        # crashing with ZeroDivisionError.
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        accuracy = 100 * correct / total if total else float('nan')
        print(f'Epoch {epoch+1}: Loss={avg_loss:.4f}, Acc={accuracy:.2f}%')
32
 
33
def main():
    """Fine-tune ``bert-base-chinese`` on ``news_dataset.csv`` and save it.

    Reads the ``text`` and ``category`` columns from the CSV, splits the
    samples 80/20 into training and validation sets, trains with
    ``train_model`` and writes the resulting weights to
    ``news_classifier.pth``.
    """
    df = pd.read_csv('news_dataset.csv')

    # Rows lacking a text or a category cannot be used as labelled samples,
    # and a NaN category would otherwise distort the class count.
    df = df.dropna(subset=['text', 'category'])

    # The loss expects integer class ids in [0, num_labels). Raw categories
    # may be strings or arbitrary integers, so map them to contiguous ids.
    # sort=True keeps the mapping reproducible: id i is the i-th category
    # in sorted order.
    label_ids, categories = pd.factorize(df['category'], sort=True)

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-chinese',
        num_labels=len(categories),
    )

    dataset = NewsDataset(df['text'].values, label_ids, tokenizer)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    train_model(model, train_loader, val_loader, device)

    torch.save(model.state_dict(), 'news_classifier.pth')