使用NLP技术进行京东商品评论情感分析
# Collect the paths of the review files under the data directory
import os


def collect_review_files(data_root):
    """Return the existing ``neg.txt`` / ``pos.txt`` paths below *data_root*.

    Every sub-directory reported by ``os.walk`` is checked for the two
    files. Only files that actually exist are returned, so a directory that
    lacks one of them does not make a later ``open`` call fail with
    ``FileNotFoundError``.

    Args:
        data_root: Directory that is searched recursively.

    Returns:
        A list of file paths. Within a directory ``neg.txt`` comes before
        ``pos.txt``; sibling directories are visited in sorted order.
    """
    found = []
    for root, dirs, _files in os.walk(data_root):
        # Sorting in place also fixes the order in which os.walk descends,
        # so the result does not depend on the file system's listing order.
        dirs.sort()
        for sub_dir in dirs:
            for file_name in ('neg.txt', 'pos.txt'):
                candidate = os.path.join(root, sub_dir, file_name)
                if os.path.isfile(candidate):
                    found.append(candidate)
    return found


all_file_path = collect_review_files('data')
# Preprocess and tokenize the reviews, vectorize them, and train a
# sentiment classifier.
import os

import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

data = []        # rows of [space-joined tokens, category directory name]
sentiments = []  # sentiment label per row, parallel to ``data``
for file_path in all_file_path:
    # The parent directory name is the category; the file name stem
    # ('neg' / 'pos') is the sentiment label of every line in the file.
    category = os.path.basename(os.path.dirname(file_path))
    sentiment = os.path.splitext(os.path.basename(file_path))[0]
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            text = line.strip()
            if not text:
                # Blank lines would become empty documents carrying a label.
                continue
            data.append([' '.join(jieba.lcut(text)), category])
            sentiments.append(sentiment)
df = pd.DataFrame(data, columns=['comment', 'category'])
df['sentiment'] = sentiments

# Text vectorization
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of
# two or more characters, so single-character words are dropped — confirm
# that this is intended for the tokenized Chinese text.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['comment'])

# Sentiment analysis: the target is the pos/neg label, not the category
# directory, so the reported accuracy really measures sentiment prediction.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, df['sentiment'], random_state=42
)
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(f"情感分析在测试集上的准确率为:{accuracy_score(y_test, lr.predict(X_test))}")
下载地址
用户评论