Last weekend I took an order for 1200; the customer-service platform kept a 10% commission, so I received 1000. I finished it in two hours and was quite pleased. Orders like this are actually not common: low technical difficulty but a high price, what we commonly call an "easy-money order". I thought about treating the goddess to a meal with the earnings, but was mercilessly rejected!
Effect Display
Tool Preparation
- Data Source: https://maoyan.com/board/4?offset=1
- Development Environment: Win10, Python 3.7
- Development Tools: PyCharm, Chrome
Project Thought Analysis
First, collect all the movie information from Maoyan. Here we take Maoyan's Top 100 list as an example and gather the following fields for each movie:
- Movie Name
- Movie Rating
- Movie Link
- Movie Type
- Movie Release Location
- Movie Duration
- Movie Release Time
Parsing the page data involves four steps:
- Parse the detail-page links from the homepage
- Extract the rating from the homepage, since the rating on the Maoyan detail page is encrypted
- Extract the remaining data from each detail page
- Save the data to a CSV file for easy visualization later
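On the list page, the rating is split into an integer node and a fraction node that the crawler joins back together. A minimal sketch of that join, using a regex on a hypothetical snippet of the list-page markup (standard library only, no live request):

```python
import re

# Hypothetical snippet mimicking Maoyan's list-page markup (not fetched live)
sample = '<p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>'

integers = re.findall(r'<i class="integer">([^<]*)</i>', sample)
fractions = re.findall(r'<i class="fraction">([^<]*)</i>', sample)

# Join the two halves exactly as the crawler does with p1 + p2
ratings = [p1 + p2 for p1, p2 in zip(integers, fractions)]
print(ratings)  # ['9.5']
```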
Tools Needed for Data Visualization
import pandas as pd
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# get_ipython().run_line_magic('matplotlib', 'inline')
Effect Picture Display
Source Code Display:
Web Crawler:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : June 5, 2021
# @File : demo4.py
import time

import requests
from fake_useragent import UserAgent
from lxml import etree

# Random request header
ua = UserAgent()
# Build the request headers. If the request fails, open the page in a browser,
# solve the captcha, and replace the Cookie below with your own.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Cookie': '__mta=244176442.1622872454168.1622876903037.1622877097390.7; uuid_n_v=v1; uuid=6FFF6D30C5C211EB8D61CF53B1EFE83FE91D3C40EE5240DCBA0A422050B1E8C0; _csrf=bff9b813020b795594ff3b2ea3c1be6295b7453d19ecd72f8beb9700c679dfb4; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1622872443; _lxsdk_cuid=1770e9ed136c8-048c356e76a22b-7d677965-1fa400-1770e9ed136c8; _lxsdk=6FFF6D30C5C211EB8D61CF53B1EFE83FE91D3C40EE5240DCBA0A422050B1E8C0; ci=59; recentCis=59; __mta=51142166.1622872443578.1622872443578.1622876719906.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1622877097; _lxsdk_s=179dafd56bf-06d-403-d81%7C%7C12',
    'User-Agent': str(ua.random)
}

def RequestsTools(url):
    '''
    Crawler request helper
    :param url: request address
    :return: HTML object for XPath extraction
    '''
    response = requests.get(url, headers=headers).content.decode('utf-8')
    html = etree.HTML(response)
    return html

def Index(page):
    '''
    Homepage (list page) function
    :param page: page offset
    :return:
    '''
    url = 'https://maoyan.com/board/4?offset={}'.format(page)
    html = RequestsTools(url)
    # Detail-page address suffixes
    urls_text = html.xpath('//a[@class="image-link"]/@href')
    # Rating: the integer part and the fraction part are separate nodes
    pingfen1 = html.xpath('//i[@class="integer"]/text()')
    pingfen2 = html.xpath('//i[@class="fraction"]/text()')
    for i, p1, p2 in zip(urls_text, pingfen1, pingfen2):
        pingfen = p1 + p2
        urs = 'https://maoyan.com' + i
        # Avoid requesting too frequently
        time.sleep(2)
        Details(urs, pingfen)

def Details(url, pingfen):
    html = RequestsTools(url)
    dianyan = html.xpath('//h1[@class="name"]/text()')  # Movie name
    leixing = html.xpath('//li[@class="ellipsis"]/a/text()')  # Type
    diqu = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[2]/text()')  # Area / duration
    timedata = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')  # Release time
    for d, l, b, t in zip(dianyan, leixing, diqu, timedata):
        countyr = b.replace('\n', '').split('/')[0]  # Area
        shichang = b.replace('\n', '').split('/')[1]  # Duration
        with open('maoyan.csv', 'a', encoding='utf-8') as f:
            f.write('{}, {}, {}, {}, {}, {}, {}\n'.format(d, pingfen, url, l, countyr, shichang, t))
        print(d, pingfen, url, l, countyr, shichang, t)

# Top 100 list: 10 pages, offsets 0, 10, ..., 90
for page in range(0, 100, 10):
    Index(page)
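The crawler writes rows without a header line, so downstream code has to supply the column names itself. A hedged sketch of reading such a file with the standard library, using a simulated row in the same column order as the `f.write` call above (the field values here are made up for illustration):

```python
import csv
import io

# Simulated file contents; real data comes from the CSV the crawler writes
raw = "霸王别姬, 9.5, https://maoyan.com/films/0000, 剧情, 中国大陆, 171min, 1993\n"

columns = ['name', 'rating', 'link', 'type', 'area', 'duration', 'release']
rows = [dict(zip(columns, (cell.strip() for cell in line)))
        for line in csv.reader(io.StringIO(raw))]
print(rows[0]['rating'])  # 9.5
```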
Visualization
#!/usr/bin/env python
# coding: utf-8
# Load commonly used data analysis libraries
import pandas as pd
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# get_ipython().run_line_magic('matplotlib', 'inline')
# In[3]:
path='./maoyan.csv'
df=pd.read_csv(path,sep=',',encoding='utf-8',index_col=False)
df.drop(df.columns[0],axis=1,inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.head(10)
# View data structure
df.info()
print(df.columns)
# In[11]:
# Year & number of movies released. Counts after 2018 only reflect what Maoyan has published so far and are incomplete, so we first drop years after 2018.
fig,ax=plt.subplots(figsize=(9,6),dpi=70)
df[df[u'上映时间']<2018][u'上映时间'].value_counts().sort_index().plot(kind='line',ax=ax)
ax.set_xlabel(u'Time (Year)')
ax.set_ylabel(u'Number Released')
ax.set_title(u'Release Time & Number of Movies Released')
# Based on the above chart, let's create a chart showing the relationship between release time, number of releases, and ratings.
# However, since the amount of data before 1980 is small and the ratings are inaccurate, the main analysis area is concentrated from 1980 to 2017.
x=df[df[u'上映时间']<2018][u'上映时间'].value_counts().sort_index().index
y=df[df[u'上映时间']<2018][u'上映时间'].value_counts().sort_index().values
y2=df[df[u'上映时间']<2018].sort_values(by=u'上映时间').groupby(u'上映时间').mean()[u'评分'].values
fig,ax=plt.subplots(figsize=(10,5),dpi=70)
ax.plot(x,y,label=u'Number Released')
ax.set_xlim(1980,2017)
ax.set_xlabel(u'Release Time')
ax.set_ylabel(u'Number Released')
ax.set_title(u'Average Ratings & Number Released Over Time')
ax2=ax.twinx()
ax2.plot(x,y2,c='y',ls='--',label=u'Ratings')
ax.legend(loc=1)
ax2.legend(loc=2)
# Solve Chinese garbled characters and the issue of negative values not being displayed on the axes.
plt.rcParams['font.sans-serif'] =['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
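The per-year average rating above comes from `groupby(u'上映时间').mean()[u'评分']`. The same group-then-average logic can be sketched in plain Python (stand-in values, not the real scrape):

```python
# Stand-in data; the real values come from the 上映时间 and 评分 columns
years = [2015, 2015, 2016]
scores = [9.0, 8.0, 7.5]

# Group scores by year, then average each group
by_year = {}
for y, s in zip(years, scores):
    by_year.setdefault(y, []).append(s)

avg = {y: sum(v) / len(v) for y, v in by_year.items()}
print(avg)  # {2015: 8.5, 2016: 7.5}
```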
# In[12]:
# World & Release Time & Average Ratings
fig,ax=plt.subplots(figsize=(10,7),dpi=60)
df[df[u'评分']>0].groupby(u'上映时间').mean()[u'评分'].plot(kind='line',ax=ax)
ax.set_ylabel(u'Ratings')
ax.set_title(u'World & Release Time & Average Ratings')
# In[13]:
# Number of Movies by Type Worldwide
# Cut the types into the smallest unit and then count.
types=[]
for tp in df[u'类型']:
    ls = tp.split(',')
    for x in ls:
        types.append(x)
tp_df=pd.DataFrame({u'类型':types})
fig,ax=plt.subplots(figsize=(9,6),dpi=60)
tp_df[u'类型'].value_counts().plot(kind='bar',ax=ax)
ax.set_xlabel(u'Type')
ax.set_ylabel(u'Number')
ax.set_title(u'World & Type & Number')
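The loop above flattens the comma-separated genre strings before counting them with `value_counts()`. The same tally can be sketched with `collections.Counter` (sample data standing in for the real column):

```python
from collections import Counter

genres_col = ['剧情,爱情', '剧情,犯罪', '动画']  # stand-in for df[u'类型']

# Split each cell into individual genres, then count occurrences
counts = Counter(g for cell in genres_col for g in cell.split(','))
print(counts['剧情'])  # 2
```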
# In[14]:
# Distribution of Movie Duration & Ratings
# One issue: some movies have not been rated at all, so we need to filter those out first.
x=df[df[u'评分']>0].sort_values(by=u'时长(min)')[u'时长(min)'].values
y=df[df[u'评分']>0].sort_values(by=u'时长(min)')[u'评分'].values
fig,ax=plt.subplots(figsize=(9,6),dpi=70)
ax.scatter(x,y,alpha=0.6,marker='o')
ax.set_xlabel(u'Duration (min)')
ax.set_ylabel(u'Ratings')
ax.set_title(u'Movie Duration & Ratings Distribution')
# The spread of ratings across different durations can be read off the scatter plot.
# ...
# In[41]:
Conclusion
The full source code is given above, so I won't walk through it line by line. If this is useful to you, please give it a thumbs up, thank you very much. Finally, let me show you the process of taking this order.
PS: Always take orders through a third-party platform!!!
Copyright Statement: This article is an original article by the blogger, following the CC 4.0 BY-SA copyright agreement. Please attach the original source link and this statement when reprinting. Article link:
https://blog.csdn.net/lland5201314/article/details/117606268