初学爬虫,自己确实对爬虫挺感兴趣,觉得爬到数据的那种感觉很开心。
这次抓取有两点比较难:一是评分,二是票房。评分可以在榜单列表页取到(见下方代码);
票房采用的是自定义字体,难度较大,需要将字体中每个字形的点坐标下载下来逐一对比——目前尚未解决!
import gzip
from urllib import request as r
import MySQLdb
from lxml import etree
from fake_useragent import UserAgent
# Crawl the Maoyan "board/4" ranking and store one row per movie in the MySQL
# table `mao_spider`. The score is read from the list page; every other field
# comes from the movie's own detail page.

SITE_ROOT = "https://maoyan.com"
# Both list and detail pages are fetched over HTTPS.
BOARD_URL = SITE_ROOT + "/board/4?offset="
PAGE_COUNT = 10  # number of list pages requested
PAGE_STEP = 10   # the `offset` query parameter advances by this much per page
GZIP_MAGIC = b"\x1f\x8b"  # first two bytes of every gzip stream
INSERT_SQL = "insert into mao_spider (movie_cn_name,movie_en_name,movie_type,movie_place,movie_show_time,movie_sorce,movie_time) values(%s,%s,%s,%s,%s,%s,%s)"


def decode_body(raw: bytes) -> str:
    """Return a response body as text, un-gzipping it first when needed.

    The body is treated as gzip only when it starts with the gzip magic
    bytes; anything else is decoded as-is (UTF-8, the ``bytes.decode``
    default).
    """
    if raw[:2] == GZIP_MAGIC:
        raw = gzip.decompress(raw)
    return raw.decode()


def fetch_html(url, headers):
    """Request ``url`` with ``headers`` and return the decoded page text.

    The response is closed as soon as the body has been read. Network
    failures propagate to the caller as ``OSError`` subclasses.
    """
    req = r.Request(url=url, headers=headers)
    with r.urlopen(req) as resp:
        return decode_body(resp.read())


def split_brief(brief_items):
    """Split the detail page's brief texts into the stored column values.

    ``brief_items`` is the list of text nodes selected from the
    ``movie-brief-container`` list. Item 0 is stored as ``movie_type``;
    item 1 is split on "/" into ``movie_place`` (first part) and
    ``movie_time`` (second part); the first 10 characters of item 2 are
    stored as ``movie_show_time``.

    Returns ``(movie_type, movie_place, movie_show_time, movie_time)``, or
    ``None`` when there are fewer than three items or item 1 has no "/".
    """
    if len(brief_items) < 3:
        return None
    place_and_time = brief_items[1].split("/")
    if len(place_and_time) < 2:
        return None
    return (
        brief_items[0],
        place_and_time[0],
        brief_items[2][:10],
        place_and_time[1],
    )


def crawl_movie(conn, cursor, headers, movie_url, score):
    """Fetch one detail page and insert its row; report and skip on failure.

    A page that cannot be fetched, lacks an expected element, or whose insert
    fails is reported on stdout and skipped so the rest of the crawl goes on.
    """
    try:
        detail = etree.HTML(fetch_html(movie_url, headers))
    except OSError as exc:  # URLError/HTTPError are OSError subclasses
        print("detail page failed, skipped:", movie_url, exc)
        return

    cn_names = detail.xpath("//h3[@class='name']/text()")
    en_names = detail.xpath("//div[@class='ename ellipsis']/text()")
    brief_items = detail.xpath("//div[@class='movie-brief-container']/ul/li/text()")
    fields = split_brief(brief_items)
    if not cn_names or not en_names or fields is None:
        print("detail page incomplete, skipped:", movie_url)
        return

    movie_type, place, show_time, duration = fields
    print(cn_names[0], en_names[0], brief_items[0], brief_items[1], brief_items[2], score)
    row = (cn_names[0], en_names[0], movie_type, place, show_time, score, duration)
    try:
        cursor.execute(INSERT_SQL, row)
        conn.commit()
    except MySQLdb.Error as exc:
        # Undo the failed statement so the next insert starts clean.
        conn.rollback()
        print("insert failed, rolled back:", cn_names[0], exc)
    else:
        print("添加成功!!!")


def crawl_board_page(conn, cursor, headers, offset):
    """Fetch the list page at ``offset`` and crawl every movie linked on it."""
    list_url = BOARD_URL + str(offset)
    try:
        board = etree.HTML(fetch_html(list_url, headers))
    except OSError as exc:
        print("list page failed, skipped:", list_url, exc)
        return

    movie_paths = board.xpath("//dl[@class='board-wrapper']/dd/a/@href")
    score_ints = board.xpath("//i[@class='integer']/text()")
    score_fracs = board.xpath("//i[@class='fraction']/text()")
    if min(len(score_ints), len(score_fracs)) < len(movie_paths):
        print("list page has fewer scores than movies, skipped:", list_url)
        return

    # Pair each link with its score by position, so a repeated href cannot
    # pick up another entry's score.
    for path, whole, fraction in zip(movie_paths, score_ints, score_fracs):
        crawl_movie(conn, cursor, headers, SITE_ROOT + path, whole + fraction)


def main():
    """Connect to MySQL, crawl all list pages, and always release the connection."""
    conn = MySQLdb.connect(
        host='***',
        port=3306,
        user="root",
        password="***",
        db="spider",
        charset="utf8"
    )
    cursor = conn.cursor()
    try:
        # One random User-Agent is picked up front and reused for every request.
        headers = {
            'User-Agent': UserAgent().random,
        }
        for page in range(PAGE_COUNT):
            crawl_board_page(conn, cursor, headers, page * PAGE_STEP)
    finally:
        cursor.close()
        conn.close()


if __name__ == "__main__":
    main()