Source: blog.csdn.net/lland5201314/article/details/117606268
Effect Display

Tool Preparation
- Data Source: https://maoyan.com/board/4?offset=1
- Development Environment: Windows 10, Python 3.7
- Development Tools: PyCharm, Chrome
Project Idea Analysis
- Movie Name
- Movie Rating
- Movie Link
- Movie Type
- Movie Release Location
- Region
- Movie Duration
- Release Time

Tools Needed for Data Visualization
import pandas as pd
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# get_ipython().run_line_magic('matplotlib', 'inline')
Effect Diagram Display

Source Code Display
Web Scraping Code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021-06-05
# @File : demo4.py
import requests
from fake_useragent import UserAgent
from lxml import etree
import time
# Random request header
ua = UserAgent()
# Build the request headers; copy the Cookie below from your own browser session.
# If requests stop returning data, open the page in a browser, solve the captcha, then refresh the Cookie.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Cookie': '__mta=244176442.1622872454168.1622876903037.1622877097390.7; uuid_n_v=v1; uuid=6FFF6D30C5C211EB8D61CF53B1EFE83FE91D3C40EE5240DCBA0A422050B1E8C0; _csrf=bff9b813020b795594ff3b2ea3c1be6295b7453d19ecd72f8beb9700c679dfb4; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1622872443; _lxsdk_cuid=1770e9ed136c8-048c356e76a22b-7d677965-1fa400-1770e9ed136c8; _lxsdk=6FFF6D30C5C211EB8D61CF53B1EFE83FE91D3C40EE5240DCBA0A422050B1E8C0; ci=59; recentCis=59; __mta=51142166.1622872443578.1622872443578.1622876719906.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1622877097; _lxsdk_s=179dafd56bf-06d-403-d81%7C%7C12',
    'User-Agent': str(ua.random)
}
def RequestsTools(url):
    '''
    Web scraping request tool function
    :param url: Request address
    :return: HTML object for xpath extraction
    '''
    response = requests.get(url, headers=headers).content.decode('utf-8')
    html = etree.HTML(response)
    return html
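# Quick smoke test for the tool function above (illustrative only, and assuming
# the Cookie is still valid; uncomment to try it):
# html = RequestsTools('https://maoyan.com/board/4?offset=0')
# print(html.xpath('//a[@class="image-link"]/@href')[:3])  # first few detail-page suffixes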
def Index(page):
    '''
    Homepage function
    :param page: Page offset (0, 10, 20, ...)
    :return:
    '''
    url = 'https://maoyan.com/board/4?offset={}'.format(page)
    html = RequestsTools(url)
    # Detail-page address suffixes
    urls_text = html.xpath('//a[@class="image-link"]/@href')
    # Rating: integer part and fractional part
    pingfen1 = html.xpath('//i[@class="integer"]/text()')
    pingfen2 = html.xpath('//i[@class="fraction"]/text()')
    for i, p1, p2 in zip(urls_text, pingfen1, pingfen2):
        pingfen = p1 + p2
        urs = 'https://maoyan.com' + i
        # Avoid sending requests too quickly
        time.sleep(2)
        Details(urs, pingfen)
def Details(url, pingfen):
    html = RequestsTools(url)
    dianyan = html.xpath('//h1[@class="name"]/text()')  # Movie name
    leixing = html.xpath('//li[@class="ellipsis"]/a/text()')  # Genre
    diqu = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[2]/text()')  # Region / duration line
    timedata = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')  # Release time
    for d, l, b, t in zip(dianyan, leixing, diqu, timedata):
        countyr = b.replace('\n', '').split('/')[0]  # Region
        shichang = b.replace('\n', '').split('/')[1]  # Duration
        with open('猫眼.csv', 'a', encoding='utf-8') as f:
            f.write('{}, {}, {}, {}, {}, {}, {}\n'.format(d, pingfen, url, l, countyr, shichang, t))
        print(d, pingfen, url, l, countyr, shichang, t)
# The Top-100 board has 10 pages, with offsets 0, 10, ..., 90
for page in range(10):
    Index(page * 10)
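Maoyan occasionally serves a captcha page instead of the board. A minimal retry sketch is shown below; the status-code check, the three attempts, and the 3-second back-off are illustrative choices, not part of the original script.
def SafeRequestsTools(url, max_retries=3):
    # Same request pattern as RequestsTools above, retried on non-200 responses
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return etree.HTML(response.content.decode('utf-8'))
        time.sleep(3)  # back off before the next attempt
    raise RuntimeError('Failed to fetch {} after {} attempts'.format(url, max_retries))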
Visualization Code
#!/usr/bin/env python
# coding: utf-8
# Load commonly used data analysis libraries
import pandas as pd
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# get_ipython().run_line_magic('matplotlib', 'inline')
# In[3]:
# Note: the scraper above writes 猫眼.csv; rename that file (or change this path) so the two match
path='./maoyan.csv'
df=pd.read_csv(path,sep=',',encoding='utf-8',index_col=False)
df.drop(df.columns[0],axis=1,inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.head(10)
# View the structure of the data
df.info()
print(df.columns)
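# Note: the scraper writes its CSV without a header row. If your file also lacks
# one, pass explicit column names instead (a sketch only: the seven names below
# follow the scraper's write order, while the analysis further down indexes
# eight columns, so adjust the list to match your actual file):
# df = pd.read_csv(path, sep=',', encoding='utf-8',
#                  names=[u'电影', u'评分', u'链接', u'类型', u'地区', u'时长(min)', u'上映时间'])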
# In[11]:
# Year vs. number of releases; counts after 2018 are incomplete, so exclude them for now
fig,ax=plt.subplots(figsize=(9,6),dpi=70)
df[df[u'上映时间']<2018][u'上映时间'].value_counts().sort_index().plot(kind='line',ax=ax)
ax.set_xlabel(u'Time (Year)')
ax.set_ylabel(u'Number of Releases')
ax.set_title(u'Release Time & Number of Movies Released')
# Based on the graph above, plot release time, release count, and average rating together
# Data before 1980 is too sparse for reliable averages, so the analysis focuses on 1980-2017
x=df[df[u'上映时间']<2018][u'上映时间'].value_counts().sort_index().index
y=df[df[u'上映时间']<2018][u'上映时间'].value_counts().sort_index().values
y2=df[df[u'上映时间']<2018].sort_values(by=u'上映时间').groupby(u'上映时间').mean()[u'评分'].values
fig,ax=plt.subplots(figsize=(10,5),dpi=70)
ax.plot(x,y,label=u'Number of Releases')
ax.set_xlim(1980,2017)
ax.set_xlabel(u'Release Time')
ax.set_ylabel(u'Number of Releases')
ax.set_title(u'Release Time, Number of Releases, & Average Ratings')
ax2=ax.twinx()
ax2.plot(x,y2,c='y',ls='--',label=u'Ratings')
ax.legend(loc=1)
ax2.legend(loc=2)
# Resolve Chinese characters and minus signs rendering as boxes on the axes
# (these two rcParams should really be set once, right after importing matplotlib,
# so that all of the plots above pick them up)
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
# In[12]:
# Worldwide average rating by release year
fig,ax=plt.subplots(figsize=(10,7),dpi=60)
df[df[u'评分']>0].groupby(u'上映时间').mean()[u'评分'].plot(kind='line',ax=ax)
ax.set_ylabel(u'Ratings')
ax.set_title(u'Worldwide Average Rating by Release Year')
# In[13]:
# Number of movies by genre worldwide
# Split each genre string into individual genres, then count them (see the sketch below)
types=[]
for tp in df[u'类型']:
    ls=tp.split(',')
    for x in ls:
        types.append(x)
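# The loop above only collects the split genre labels; a short follow-up sketch
# (using only libraries already imported) turns them into the per-genre counts:
type_counts = pd.Series(types).value_counts()
fig, ax = plt.subplots(figsize=(12, 6), dpi=70)
type_counts.plot(kind='bar', ax=ax)
ax.set_title(u'Number of Movies by Genre Worldwide')
ax.set_xlabel(u'Genre')
ax.set_ylabel(u'Number of Movies')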
# In[14]:
# Movie duration vs. rating distribution
# Note: some movies have no rating yet, so those are filtered out
x=df[df[u'评分']>0].sort_values(by=u'时长(min)')[u'时长(min)'].values
y=df[df[u'评分']>0].sort_values(by=u'时长(min)')[u'评分'].values
fig,ax=plt.subplots(figsize=(9,6),dpi=70)
ax.scatter(x,y,alpha=0.6,marker='o')
ax.set_xlabel(u'Length (min)')
ax.set_ylabel(u'Ratings')
ax.set_title(u'Movie Length & Ratings Distribution')
# Build the mainland-China subset row by row
i=0
c0=[]
c1=[]
c2=[]
c3=[]
c4=[]
c5=[]
c6=[]
c7=[]
for x in df[u'地区']:
    if u'中国大陆' in x:
        c0.append(df.iat[i, 0])
        c1.append(df.iat[i, 1])
        c2.append(df.iat[i, 2])
        c3.append(df.iat[i, 3])
        c4.append(df.iat[i, 4])
        c5.append(df.iat[i, 5])
        c6.append(df.iat[i, 6])
        c7.append(df.iat[i, 7])
    i=i+1
china_df=pd.DataFrame({u'电影':c0, u'评分':c1,u'链接':c2, u'类型':c3,u'地区':c4, u'上映地点':c5,u'时长(min)':c6,u'上映时间':c7})
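# For reference, the same mainland-China subset can be built without the manual
# loop (a one-line sketch; assumes u'地区' has no NaN values, which the dropna
# above should guarantee):
# china_df = df[df[u'地区'].str.contains(u'中国大陆')].reset_index(drop=True)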
# In[16]:
# Comparison of Average Ratings between China & the World from 1980-2017
x1 = df[df[u'评分']>0].groupby(u'上映时间').mean()[u'评分'].index
y1 = df[df[u'评分']>0].groupby(u'上映时间').mean()[u'评分'].values
x2 = china_df[china_df[u'评分']>0].groupby(u'上映时间').mean()[u'评分'].index
y2 = china_df[china_df[u'评分']>0].groupby(u'上映时间').mean()[u'评分'].values
fig,ax=plt.subplots(figsize=(12,9),dpi=60)
ax.plot(x1,y1,ls='-',c='DarkTurquoise',label=u'World')
ax.plot(x2,y2,ls='--',c='Gold',label=u'China')
ax.set_title(u'Average Ratings Comparison between China & the World')
ax.set_xlabel(u'Time')
ax.set_xlim(1980,2017)
ax.set_ylabel(u'Ratings')
ax.legend()
# In[17]:
# Number of Movies by Genre Comparison between China & the World
# Since genres are mixed, to facilitate statistics, first write a function to split genres
# In[18]:
# Function to split genres: takes a Series of genre strings,
# returns a DataFrame of per-genre counts sorted in descending order
def Cutting_type(typeS):
    types = []
    for x in typeS:
        # Multi-genre entries are comma-separated; split() also handles single genres
        for t in x.split(','):
            types.append(t)
    counts = pd.DataFrame({u'类型': types})[u'类型'].value_counts()
    return pd.DataFrame(counts.sort_values(ascending=False))
# In[19]:
# Comparison of Movie Types between China & the World
df1=Cutting_type(china_df[u'类型'])
df2=Cutting_type(df[u'类型'])
trans=pd.concat([df1,df2],axis=1)
trans.dropna(inplace=True)
trans.columns=[u'China',u'World']
fig,ax=plt.subplots(figsize=(15,9),dpi=80)
trans.plot(kind='bar',ax=ax)
fig.autofmt_xdate(rotation=30)
ax.set_title(u'Comparison of Movie Types between China & the World')
ax.set_xlabel(u'Genre')
ax.set_ylabel(u'Number of Movies')
# In[20]:
# Scatter comparison of duration vs. rating, China vs. the world
y = df[df[u'评分'] > 0].sort_values(by=u'时长(min)')[u'评分'].values
x = df[df[u'评分'] > 0].sort_values(by=u'时长(min)')[u'时长(min)'].values
y2 = china_df[china_df[u'评分'] > 0].sort_values(by=u'时长(min)')[u'评分'].values
x2 = china_df[china_df[u'评分'] > 0].sort_values(by=u'时长(min)')[u'时长(min)'].values
fig, ax = plt.subplots(figsize=(10,7), dpi=80)
ax.scatter(x, y, c='DeepSkyBlue', alpha=0.6, label=u'World')
ax.scatter(x2, y2, c='Salmon', alpha=0.7, label=u'China')
ax.set_title(u'Distribution of Ratings in China & the World')
ax.set_xlabel(u'Length (min)')
ax.set_ylabel(u'Ratings')
ax.legend(loc=4)
# In[25]:
dfs=df[(df[u'上映时间']>1980)&(df[u'上映时间']<2019)]
# for x in range(0,len(dfs)):
# print(dfs.iat[x,0],dfs.iat[x,-1])
df666 = dfs['电影'][:15]
wl = ",".join(df666.values)
# Write the segmented txt to a text file
# fenciTxt = open("fenciHou.txt","w+")
# fenciTxt.writelines(wl)
# fenciTxt.close()
# Configure the word cloud
wc = WordCloud(background_color="white",  # Background color
               # mask=imread('shen.jpg'),  # Optional background image
               # max_words=2000,  # Optional cap on the number of words displayed
               font_path=r"C:\Windows\Fonts\simkai.ttf",  # KaiTi font; the default DroidSansMono.ttf cannot render Chinese
               max_font_size=60,  # Maximum font size
               random_state=30,  # Number of random color layouts to generate
               )
myword = wc.generate(wl) # Generate word cloud
wc.to_file('result.jpg')
# Display word cloud
plt.imshow(myword)
plt.axis("off")
plt.show()
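# jieba is imported above but never called: the cloud is built from whole movie
# titles. For true word segmentation, a sketch reusing the wc object from above:
wl_seg = " ".join(jieba.cut(",".join(df666.values)))  # segment the titles into words
plt.imshow(wc.generate(wl_seg))
plt.axis("off")
plt.show()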