本次通過貓眼電影,對春節賀歲大片【滿江紅】進行數據分析。而本次我們通過動態接口形式獲取評論信息,靜態html解析需要額外的字體解析,網上的教程也已經很全了,有興趣的小伙伴們也可以多多沖浪或和本人探討哈!
滿江紅影圖
一、 接口分析
1. 目標站點:貓眼H5
接口列表
2. 通過滑動查看評論信息,或點擊評論進入評論子頁面滑動,即可抓取到相關接口(瀏覽器F12工具中只能抓取到子評論接口,如果要整個評論的需要抓包工具配合或使用手機抓包)
接口詳情
3. 評論接口(已做Base64編碼處理,使用前需先解碼)
aHR0cHM6Ly9tLm1hb3lhbi5jb20vYXBvbGxvL2Fwb2xsb2FwaS9tbWRiL3JlcGxpZXMvY29tbWVudC8xMTY3MTI5MDg5Lmpzb24/X3ZfPXllcyZvZmZzZXQ9NDA=
二、 響應分析
1. 通過子評論接口,可以分析出來相關字段(暱稱、性別、評分、評論內容、評論點贊量、用戶等級等)
{
"cmts": [
{
"Approve": 0,
"assistAwardInfo": {
"avatar": "",
"celebrityId": 0,
"celebrityName": "",
"rank": 0,
"title": ""
},
"avatarurl": "https://img.meituan.net/maoyanuser/e6f7600fa2980a929accb602fde5abaa2776.jpg",
"channelId": 70001,
"content": "在電影院看真的很有氛圍!背景音樂也很加分",
"deleted": false,
"id": 1171602285,
"ipLocName": "福建",
"nickName": "腿小菇",
"time": "2023-02-27 10:24",
"userId": 1322748722,
"userLevel": 3,
"vipInfo": "",
"vipType": 0
}
],
"ocm": {
"approve": 8657,
"approved": false,
"assistAwardInfo": {
"avatar": "",
"celebrityId": 0,
"celebrityName": "",
"rank": 0,
"title": ""
},
"authInfo": "",
"avatarurl": "https://img.meituan.net/avatar/66fb6e3ef190201864c732a03b5d9be924014.jpg",
"content": "剛看完滿江紅,真的好看,這是我看過最值的一部電影,反轉反轉再反轉,真的是永遠想不到下一步是什么,而且還很搞笑,搞笑又宏偉,真的描述不出來這個電影的好,都給我去看!滿江紅!入股不虧!!!!",
"id": 1167129089,
"ipLocName": "遼寧",
"isMajor": false,
"juryLevel": 0,
"majorType": 0,
"mvid": 1462626,
"nick": "Gpc126688235",
"nickName": "Gpc126688235",
"oppose": 0,
"pro": false,
"reply": 680,
"score": 5,
"spoiler": 0,
"supportComment": true,
"supportLike": true,
"sureViewed": 1,
"tagList": {
"fixed": [
{
"id": 1,
"name": "購票好評"
},
{
"id": 4,
"name": "購票"
},
{
"id": 6,
"name": "優質評價"
}
]
},
"time": "2023-01-22 12:19",
"userId": 3164097169,
"userLevel": 2,
"videoDuration": 0,
"vipInfo": "",
"vipType": 0
},
"total": 60
}
2. 完整comment接口響應示例
{
"data": {
"hotIds": [
1167280609,
1167187803
],
"total": 16521,
"comments": [
{
"avatarUrl": "https://img.meituan.net/maoyanuser/80cdf9a184d40eb9ecc0e5d170f3e45d11928.png",
"buyTicket": false,
"channelId": 3,
"content": "還行吧,沒有看開心 ",
"delete": false,
"follow": false,
"gender": 1,
"id": 1171756165,
"imageUrls": [],
"ipLocName": "山東",
"likedByCurrentUser": false,
"major": false,
"movie": {
"id": 0,
"sc": 0
},
"movieId": 1462626,
"nick": "淘嘉豪",
"replyCount": 0,
"score": 9,
"showApprove": false,
"showVote": false,
"spoiler": false,
"startTime": "1677923460000",
"tagList": [
{
"id": 1,
"name": "購票好評"
},
{
"id": 4,
"name": "購票"
}
],
"time": 1677923460000,
"ugcType": 11,
"upCount": 0,
"userId": 71317227,
"userLevel": 2,
"vipType": 0
}
],
"t2total": 0,
"myComment": {}
},
"paging": {},
"ts": 1677956823197
}
三、數據解析
1. 構造請求頭,模擬數據請求
def get_film_data(offset=0, filename="film"):
    """Fetch one page of replies for the sample comment and append them to a CSV.

    The endpoint is stored Base64-encoded and decoded at call time. Its
    ``offset`` query parameter is replaced with the ``offset`` argument, so
    successive calls can page through the replies.

    Args:
        offset: Paging offset sent as the ``offset`` query parameter.
        filename: Path of the CSV file; each reply is appended to it through
            ``save_data_csv``.

    Returns:
        The ``total`` value reported by the response.

    Raises:
        KeyError: If the response lacks ``total``/``cmts`` or a reply lacks
            one of the mandatory fields.
    """
    import base64
    from urllib.parse import parse_qsl

    # Endpoint kept in its Base64 form; decoded only when the request is built.
    encoded_url = 'aHR0cHM6Ly9tLm1hb3lhbi5jb20vYXBvbGxvL2Fwb2xsb2FwaS9tbWRiL3JlcGxpZXMvY29tbWVudC8xMTY3MTI5MDg5Lmpzb24/X3ZfPXllcyZvZmZzZXQ9NDA='
    decoded_url = base64.b64decode(encoded_url).decode('utf-8')
    endpoint, _, query = decoded_url.partition('?')
    # Keep the other query parameters, but page with the caller's offset
    # instead of the value baked into the encoded URL.
    params = dict(parse_qsl(query))
    params['offset'] = offset

    headers = {
        'User-Agent': 'Mozilla/5.0 (iphone; CPU iPhone OS 11_0 like mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
    }
    cookies = {
        'uuid_n_v':'v1',
        'iuuid':'942C12B0DF4311E9ADA9C1C3B540BA45F066B2B3028841B8A0BC3544E4C0AD17',
        'ci':'1%2C%E5%8C%97%E4%BA%AC',
        '_lxsdk_cuid':'16d6c9b401ec8-0c6c86354bd8a9-5b123211-100200-16d6c9b401ec8',
        'webp':'true',
        '_lxsdk':'942C12B0DF4311E9ADA9C1C3B540BA45F066B2B3028841B8A0BC3544E4C0AD17'
    }

    # Issue the request and parse the JSON body; the timeout keeps a stalled
    # connection from hanging the crawl indefinitely.
    response = requests.get(
        endpoint,
        params=params,
        headers=headers,
        cookies=cookies,
        timeout=10,
    ).json()

    # Total number of replies reported by the endpoint.
    total = response['total']
    print(total)

    # List of reply objects on this page.
    cmts = response['cmts']
    pprint(cmts)

    for comment in cmts:
        # The keys (including the trailing spaces in 'nickName ' and
        # 'userId ') must match the header list used by save_data_csv.
        # 'gender' and 'score' are optional because reply objects may omit them.
        data = {
            'nickName ': comment['nickName'],
            'gender': comment.get('gender'),
            'score': comment.get('score'),
            'content': comment['content'],
            'userId ': comment['userId'],
            'userLevel': comment['userLevel'],
        }
        save_data_csv(data, filename)
    return total
2. 數據存儲(這裡以csv為例演示)
def save_data_csv(data, file_name):
    """Append one record to a CSV file, writing the header row first if needed.

    The header is written only when the file does not exist yet or is empty,
    so repeated calls never duplicate it.

    Args:
        data: Mapping that contains every column name listed in ``title``.
        file_name: Path of the CSV file to append to.

    Raises:
        KeyError: If ``data`` lacks one of the columns. Nothing is written to
            the file in that case.
    """
    import os

    # Column names double as the keys looked up in `data`; the trailing
    # spaces in 'nickName ' and 'userId ' are part of the keys.
    title = ['nickName ','gender','score','content','userId ','userLevel']

    # Build the row before touching the file so that a missing key cannot
    # leave a header-only file behind.
    row = [data[column] for column in title]

    # A size check replaces re-reading the whole file on every call.
    needs_header = not os.path.exists(file_name) or os.path.getsize(file_name) == 0

    with open(file_name, 'a', encoding='utf-8-sig', newline='') as fp:
        writer = csv.writer(fp)
        if needs_header:
            writer.writerow(title)
        writer.writerow(row)
    print('*'*10+'保存完畢'+'*'*10)
影評結果
四、數據可視化
1. 影評分詞
def wordcloud_analysis(file_name):
    """Build, save and display a word cloud from the 'content' column of a CSV.

    The comments are segmented with jieba, rendered into a cloud shaped by
    the image '1.jpg', saved to 'comment.png' and shown with matplotlib.

    Args:
        file_name: Path of the CSV file; it must contain a 'content' column.
    """
    df = pd.read_csv(file_name, encoding='utf-8')

    # Join the raw comment strings. Series.to_string() would instead return
    # the printable rendering of the column (index labels, alignment padding,
    # 'NaN' placeholders), all of which would leak into the tokenizer.
    content = ' '.join(df['content'].dropna().astype(str))

    # Segment with jieba, then join with spaces so that WordCloud can split
    # the text back into tokens.
    words = ' '.join(jieba.lcut(content))

    # Load the image whose shape the cloud takes; the context manager closes
    # the file handle once the pixels are copied into the array.
    with Image.open('1.jpg') as mask_image:
        mask_pic = np.array(mask_image)

    # NOTE(review): wordcloud appears to ignore width/height when a mask is
    # supplied — confirm against the library documentation.
    words_cloud = WordCloud(
        background_color='white',  # background colour of the image
        width=800, height=600,     # canvas size in pixels
        font_path='msyh.ttf',      # font file used to draw the words
        max_words=200,             # maximum number of words drawn
        max_font_size=80,          # largest font size used
        font_step=1,               # font size step between candidates
        random_state=30,           # fixed seed so the layout is reproducible
        mask=mask_pic              # shape mask for the cloud
    ).generate(words)
    words_cloud.to_file('comment.png')  # save the cloud as an image

    # Show the cloud with matplotlib, without axes.
    plt.imshow(words_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
分詞
2. 觀看人群性別及評分占比分析(由於只取得部分數據,不代表最終真實結果,勿糾)
def gender_pie_analysis(file_name, gender_labels=None):
    """Draw a donut chart of the share of each gender value in a CSV.

    Wedge labels are derived from the values actually present in the
    'gender' column, so the chart works for any number of distinct values.
    Missing values are counted under '未知' instead of being dropped.

    Args:
        file_name: Path of the CSV file; it must contain a 'gender' column.
        gender_labels: Optional mapping from gender code to wedge label.
            Codes absent from the mapping are shown as their string form.
    """
    if gender_labels is None:
        # NOTE(review): assumed code table (1 = male, 2 = female,
        # 0 = unknown) — confirm against the API before trusting the labels.
        gender_labels = {1: '男', 2: '女', 0: '未知'}

    def to_label(value):
        """Translate one raw cell of the 'gender' column into a wedge label."""
        if pd.isna(value):
            return '未知'
        try:
            # Codes are read back as floats when the column contains blanks.
            return gender_labels.get(int(value), str(value))
        except (TypeError, ValueError):
            return str(value)

    df = pd.read_csv(file_name, encoding='utf-8')
    print(df)

    # Translate before counting so that every wedge carries the label that
    # belongs to its own value, whatever the frequency order turns out to be.
    gender = df['gender'].map(to_label).value_counts()
    print(gender)

    # Create the figure and axes.
    fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
    size = 0.5
    ax.pie(
        gender,
        labels=gender.index,
        startangle=90,
        autopct='%.1f%%',
        colors=sns.color_palette('husl', len(gender)),
        radius=1,                                    # outer radius of the pie
        pctdistance=0.75,                            # radial position of the percentages
        wedgeprops=dict(width=size, edgecolor='w'),  # ring width, giving the donut shape
        textprops=dict(fontsize=10),                 # label font size
    )
    ax.set_title("【滿江紅】觀看人群性別占比", fontsize=15)
    plt.show()
性別占比
評分占比
3. 用戶等級分析
def user_level_bar_analysis(file_name):
    """Plot a bar chart of how many commenting users fall into each level.

    Args:
        file_name: Path of the CSV file; it must contain a 'userLevel' column.
    """
    df = pd.read_csv(file_name, encoding='utf-8')
    print(df)

    # Count users per level, ordered by level rather than by frequency.
    level_counts = df['userLevel'].value_counts().sort_index()
    print(level_counts)

    fig, ax = plt.subplots()
    ax.bar(level_counts.index, level_counts, color='#DE85B5')
    # Chart title.
    ax.set_title('評論用戶等級數量分布柱狀圖')
    ax.grid(True, axis='y', alpha=1)

    # Annotate every bar with its count, centred on the bar.
    for level, count in level_counts.items():
        ax.text(level, count, '%d' % count, horizontalalignment='center')

    # Hide the right and top frame lines.
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    plt.show()
等級數量分布
該篇文章只是從評分角度去做的數據分析,其實還可以從影視類型、年度電影Top、票房等角度進一步做數據分析。
該篇文章來自本人知乎號:梓羽Python
文章鏈接:
https://zhuanlan.zhihu.com/p/611295606