废话不多说，直接上代码：
import re
import requests
from bs4 import BeautifulSoup
import json
import pymysql
# Default HTTP request headers; get_html() passes this dict to requests.get().
'''
请求头
'''
headers = {
# Browser-style User-Agent string (Chrome 79 on Windows 10).
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/79.0.3945.117 Safari/537.36',
# NOTE(review): hard-coded Cookie value -- presumably copied from a logged-in
# browser session; confirm whether it is still valid and whether it should be
# kept in source control at all.
'Cookie': 'll="118211"; bid=c70emUk_m4M; _vwo_uuid_v2=D8F7DA5E2D50D80663D940CCE27B6F8E9|f3c85aa9f98e5dac3c9e14475ad2a413; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1581078016%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DRdS0q-qowQfeRe2yVGFseDlMeT2o5KBfB36iwvcYWvfjhVJCSlLzJdsT9Sqj3myF%26wd%3D%26eqid%3D811705580017458a000000065e3d5608%22%5D; _pk_id.100001.4cf6=5e696c40df043e28.1580813998.2.1581078016.1580814010.; _pk_ses.100001.4cf6=*; ap_v=0,6.0; dbcl2="209385526:kX0erbaRrwM"; ck=NBuz; __utma=30149280.926421091.1580813987.1580813987.1581078143.2; __utmb=30149280.0.10.1581078143; __utmc=30149280; __utmz=30149280.1581078143.2.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; __utma=223695111.285887900.1580813987.1580813987.1581078143.2; __utmb=223695111.0.10.1581078143; __utmc=223695111; __utmz=223695111.1581078143.2.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login'
}
def get_html(url, timeout=10):
    """Fetch *url* with the module-level ``headers`` and return its body as text.

    :param url: absolute URL to request.
    :param timeout: seconds to wait for the server before giving up
        (passed straight to ``requests.get``); without it a stalled
        connection would block the script forever.
    :return: the response body decoded as UTF-8, or ``None`` when the
        request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection errors, timeouts, invalid URLs, ...: callers already
        # treat ``None`` as "could not fetch", so report failure that way.
        return None
    if response.status_code != 200:
        return None
    # Decode the raw bytes ourselves; undecodable bytes are replaced rather
    # than aborting the whole crawl on a single bad character.
    return response.content.decode('utf-8', errors='replace')
def _extract_comment(div):
    """Build the row dict for one ``div.comment-item`` node, or ``None`` if it is malformed."""
    name_node = div.select_one("h3 span.comment-info a")
    votes_node = div.select_one("h3 span.votes")
    content_node = div.select_one("p span.short")
    if name_node is None or votes_node is None or content_node is None:
        # Not a real comment entry (e.g. a placeholder item); skip it
        # instead of failing with AttributeError on ``None.text``.
        return None
    # The rating span is optional: users may comment without rating.
    rating_node = div.select_one("h3 span.comment-info span.rating")
    star = None
    if rating_node is not None:
        star = parseDouBanGradeStar(rating_node.get('title'))
    return {
        'douBanUserName': name_node.text,              # commenter's user name
        'douBanGradeStar': star,                       # star count, or None if unrated
        'douBanGradeUserFluNumber': votes_node.text,   # text of the votes counter
        'douBanGradeContent': content_node.text,       # short-comment text
    }


def _find_next_href(paginator):
    """Return the ``href`` of the '后页 >' link inside *paginator*, or ``None``."""
    for anchor in paginator.select("a"):
        if anchor.text == '后页 >':
            return anchor.get('href')
    return None


def parse(url, count):
    """Store every comment reachable from *url* and return the running total.

    Each page is fetched with ``get_html``; every ``div.comment-item`` on it
    is converted to a dict and handed to ``writerMySql``. The function then
    follows the paginator's '后页 >' link until there is none left.

    :param url: URL of the first comments page to process.
    :param count: number of comments already processed before this call.
    :return: *count* plus the number of comments handed to ``writerMySql``.
        Note that ``writerMySql`` does not report failures, so a comment is
        counted even if its insert was rolled back.
    """
    # Iterate instead of recursing once per page: the total is always
    # returned and the recursion limit cannot be hit on long threads.
    while True:
        html = get_html(url)
        if html is None:
            # Fetch failed; stop with what has been processed so far.
            return count
        soup = BeautifulSoup(html, 'lxml')
        paginator = soup.find('div', id='paginator')
        if paginator is None:
            return count
        for div in soup.select("div.comment-item"):
            data = _extract_comment(div)
            if data is None:
                continue
            writerMySql(data)
            count += 1
        next_query = _find_next_href(paginator)
        if next_query is None:
            return count
        # The link's href is appended to the current URL stripped of its
        # own query string.
        url = splitUrlString(url) + next_query
        print('已写入', count, '条')
        print(url)
def writerMySql(data):
    """Insert one row into the ``doubangradetable`` table of the local MySQL database.

    The dict keys are used as column names and the dict values as the row
    values (passed as query parameters, not interpolated into the SQL).

    :param data: mapping of column name -> value; ``None`` or an empty
        mapping is ignored.
    :return: None. Database errors during the insert are printed and the
        transaction is rolled back; they are not raised to the caller.
    """
    if not data:
        return
    table = 'doubangradetable'
    keys = ', '.join(data.keys())
    values = ', '.join(['%s'] * len(data))
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
    db = pymysql.connect(host="localhost", user="root",
                         password="root", db="douban", port=3306)
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, tuple(data.values()))
        db.commit()
        print("commit")
    except pymysql.MySQLError as exc:
        # Only database errors are handled here; a bare ``except`` would also
        # hide KeyboardInterrupt and programming mistakes.
        db.rollback()
        print("rollback:", exc)
    finally:
        # Always release the connection, even if commit/rollback itself fails.
        db.close()
def parseDouBanGradeStar(string):
    """Translate a Douban rating label into its star count.

    :param string: rating label text, one of '很差', '较差', '还行', '推荐', '力荐'.
    :return: 1-5 for a known label (in the order listed above), otherwise ``None``.
    """
    # Labels ordered from one star to five stars.
    ordered_labels = ('很差', '较差', '还行', '推荐', '力荐')
    star_by_label = {label: stars for stars, label in enumerate(ordered_labels, start=1)}
    return star_by_label.get(string)
def splitUrlString(url_):
    """Return the part of *url_* that precedes its first ``?``.

    :param url_: URL string, with or without a query string.
    :return: everything before the first ``?``; the whole string unchanged
        when it contains no ``?``.
    """
    # ``str.find`` returns -1 when there is no '?', and using that as a slice
    # end would silently drop the last character; ``partition`` has no such trap.
    return url_.partition('?')[0]
if __name__ == '__main__':
    # Script entry point: ask for a title, query Douban's suggestion
    # endpoint, then for each suggestion ask whether its comments should be
    # crawled and written to the database.
    from urllib.parse import quote

    print('请输入要搜索的电影或者电视剧:', end='')
    videoName = input()
    print('正在努力匹配中...')
    # Percent-encode the search term so characters such as '&', '#' or
    # spaces cannot break the query string.
    initial = 'https://movie.douban.com/j/subject_suggest?q={}'.format(quote(videoName, safe=''))
    html = get_html(initial)
    loads = None
    if html is not None:
        try:
            loads = json.loads(html)
        except ValueError:
            # Body was not valid JSON; treat it like a failed lookup.
            loads = None
    if loads is None:
        print('匹配失败!')
    else:
        print('匹配成功!')
        videoCount = len(loads)
        print('共有:', videoCount, '条结果')
        for load in loads:
            videoName = str(load.get('title')).replace(' ', '')
            print(videoName, ' 是否写入数据库表中(1/0):', end=' ')
            try:
                s = int(input())
            except ValueError:
                # Anything that is not a number counts as "no".
                s = 0
            if s == 1:
                subject_id = load.get('id')
                url = 'https://movie.douban.com/subject/{}/comments?' \
                      'start=0&limit=20&sort=new_score&status=P'.format(subject_id)
                print('开始写入...')
                parse(url, 0)