一、Scraping 王者荣耀 (Honor of Kings) hero information (single page)
import json
import pymysql
import requests
from lxml import etree

def get_heros(url):
    response = requests.get(url)
    # The official site is GBK-encoded while the toolchain defaults to UTF-8,
    # so without this the Chinese text comes out garbled
    response.encoding = 'GBK'
    html_etree = etree.HTML(response.text)
    return html_etree

def extract_heros(html_etree):
    heros_list = html_etree.xpath('//ul[contains(@class,"herolist clearfix")]/li/a')
    base_url = 'https:'
    heros = []
    for hero in heros_list:
        hero_img = base_url + hero.xpath('./img/@src')[0]
        hero_name = hero.xpath('./img/@alt')[0]
        hero_info = {
            'hero_img': hero_img,
            'hero_name': hero_name
        }
        heros.append(hero_info)
    return heros

def save_heros_as_json(heros):
    # json.dumps escapes to ASCII by default and Chinese is outside ASCII;
    # ensure_ascii=False keeps the Chinese readable in the file
    hero_json = json.dumps(heros, ensure_ascii=False)
    with open('hero.json', 'a', encoding='utf-8') as w:
        w.write(hero_json)
        w.flush()

def save_heros_to_db(heros):
    conn = pymysql.Connect(host='localhost', user='root', password='6666', port=3306, database='mydb1')
    cursor = conn.cursor()
    for hero in heros:
        # Parameterized query: let pymysql quote and escape the values
        cursor.execute('insert into hero(heroname, heroimg) values(%s, %s)',
                       (hero.get('hero_name'), hero.get('hero_img')))
    conn.commit()  # pymysql does not autocommit, so the inserts must be committed
    cursor.close()
    conn.close()

if __name__ == '__main__':
    url = 'https://pvp.qq.com/web201605/herolist.shtml'
    html_etree = get_heros(url)
    heros = extract_heros(html_etree)
    # save_heros_as_json(heros)
    save_heros_to_db(heros)
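The INSERT above assumes a hero table already exists in mydb1; the script never creates it. A minimal sketch of a matching schema follows, where only the table and column names come from the script and the types, lengths, and charset are assumptions:

import pymysql

# Assumed schema: the table/column names match the INSERT above;
# everything else here is a guess, not something the original fixes.
conn = pymysql.Connect(host='localhost', user='root', password='6666', port=3306, database='mydb1')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS hero (
            id INT PRIMARY KEY AUTO_INCREMENT,
            heroname VARCHAR(64),
            heroimg VARCHAR(255)
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()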
Notes:
1. The official 王者荣耀 page is GBK-encoded, not the UTF-8 that PyCharm and the rest of the toolchain default to, so set response.encoding = 'GBK' on the response before reading its text; otherwise the Chinese is garbled.
2. json.dumps serializes with ASCII escapes by default, and Chinese characters fall outside ASCII, so the stored JSON file would not show readable Chinese; passing ensure_ascii=False (hero_json = json.dumps(heros, ensure_ascii=False)) keeps the Chinese readable.
3. A few heroes cannot be scraped from the static HTML because they are loaded dynamically; one workaround is sketched just below.
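On point 3: the heroes missing from the static HTML are rendered client-side from a JSON feed. A minimal sketch of fetching that feed directly, where the herolist.json URL and the cname/ename field names are assumptions taken from inspecting the page's network traffic (they may have changed since):

import requests

# Assumed endpoint: the JSON the hero-list page loads via JavaScript.
url = 'https://pvp.qq.com/web201605/js/herolist.json'
response = requests.get(url)
response.encoding = 'GBK'  # same GBK encoding as the rest of the site (assumed)
for item in response.json():
    print(item.get('ename'), item.get('cname'))  # numeric id, Chinese hero name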
二、Scraping 糗事百科 (Qiushibaike) jokes and storing them (multiple pages, no login needed)
import json
import pymysql
import requests
from lxml import etree

def get_jokes(url):
    """Fetch a page and return its parsed lxml element tree."""
    if not url:  # guard against an empty url
        exit()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    html_etree = etree.HTML(response.text)
    return html_etree

def extract_jokes(html_etree):
    """Extract the author and content of every joke on the page."""
    jokes_lists = html_etree.xpath('//div[contains(@class,"article block")]')  # one element per joke
    jokes = []
    for jokes_list in jokes_lists:
        author = extract_text(jokes_list.xpath('./div/a[contains(@onclick,"_hmt.push")]/h2/text()'))
        content = extract_text(jokes_list.xpath('./a/div[contains(@class,"content")]/span/text()'))
        jokes_dict = {
            'author': author,
            'content': content
        }
        jokes.append(jokes_dict)
    return jokes

def extract_text(text_list):
    """Data cleaning: join the text fragments and strip surrounding whitespace."""
    return "".join(text_list).strip()

def save_jokes_as_json(jokes, page):
    data = {
        'status': 'ok',
        'code': 200,
        'data': jokes
    }
    data_json = json.dumps(data, ensure_ascii=False)
    with open('%d.json' % page, 'a', encoding='utf-8') as w:
        w.write(data_json)
        w.flush()

def save_jokes_to_db(jokes_list):
    # Supply the server's credentials and connect once for the whole batch
    conn = pymysql.Connect(host="localhost", port=3306, user="root", password="6666", database="mydb1")
    cursor = conn.cursor()
    for joke in jokes_list:
        print(joke.get("content"))
        # Parameterized query: let pymysql quote and escape the values
        cursor.execute("insert into joke(author, joke_content) values(%s, %s)",
                       (joke.get("author"), joke.get("content")))
    # pymysql opens a transaction by default, so the inserts must be committed
    conn.commit()
    cursor.close()
    conn.close()

def get_next_url(html_etree):
    next_page_info = html_etree.xpath('//ul[contains(@class,"pagination")]/li/a/span[contains(@class,"next")]')
    if not next_page_info:
        print('Last page')
        return None  # ends the while-url loop in main
    base_url = 'https://www.qiushibaike.com'
    new_url = html_etree.xpath('//ul[contains(@class,"pagination")]/li[last()]/a/@href')[0]
    return base_url + new_url

if __name__ == '__main__':
    url = 'https://www.qiushibaike.com/text/'
    page = 1
    while url:
        print(url)
        html_etree = get_jokes(url)
        jokes = extract_jokes(html_etree)
        save_jokes_as_json(jokes, page)
        # save_jokes_to_db(jokes)
        url = get_next_url(html_etree)
        page += 1
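As in section one, save_jokes_to_db assumes its target table exists. A possible joke schema, with only the column names fixed by the INSERT and everything else assumed:

import pymysql

# Assumed schema for the joke table; types and lengths are guesses.
conn = pymysql.Connect(host='localhost', port=3306, user='root', password='6666', database='mydb1')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS joke (
            id INT PRIMARY KEY AUTO_INCREMENT,
            author VARCHAR(64),
            joke_content TEXT
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()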
三、Scraping 新片厂 (Xinpianchang) video information (login handling, multiple pages)
Problem: the first 20 pages of 新片厂's video listing need no login; past page 20 the site requires a logged-in session.
import json
import pymysql
import requests
from lxml import etree

def get_movies(url):
    # Cookie captured from a logged-in browser session: this is what gets the
    # crawler past the 20-page login wall
    headers = {
        'Cookie': 'Device_ID=5f86ec3183da2; Authorization=BD428C0D1D8684D391D8684D941D868B7F61D8683179D33964B2; _ga=GA1.2.1623768300.1602677809; _gid=GA1.2.2021033802.1602677809; UM_distinctid=1752720b8cb152-0c05a736a091e4-333376b-144000-1752720b8cc65e; PHPSESSID=3hfukfc3er7sqtcm9i0g9fh3s7; SERVER_ID=b52601c8-backend-jeatmlpn; Hm_lvt_dfbb354a7c147964edec94b42797c7ac=1602730066,1602755676,1602812945,1602836392; CNZZDATA1262268826=1592050288-1602675873-%7C1602835419; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22175270ab305301-007370f4b07bc1-333376b-1327104-175270ab3062c6%22%2C%22%24device_id%22%3A%22175270ab305301-007370f4b07bc1-333376b-1327104-175270ab3062c6%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _gat=1; channel_page=apc%3D; Hm_lpvt_dfbb354a7c147964edec94b42797c7ac=1602838892; cn_1262268826_dplus=%7B%22distinct_id%22%3A%20%221752720b8cb152-0c05a736a091e4-333376b-144000-1752720b8cc65e%22%2C%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201602838919%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201602838919%7D'
    }
    response = requests.get(url, headers=headers)
    html_etree = etree.HTML(response.text)
    return html_etree

def extract_movies(html_etree):
    movie_list = html_etree.xpath('//div[contains(@class,"channel-con")]/ul/li')
    movies = []
    for movie in movie_list:
        movie_name = movie.xpath('./div/div[contains(@class,"video-con-top")]/a/p/text()')[0]
        movie_num = movie.xpath('./div/div[contains(@class,"video-con-top")]/div[contains(@class,"video-view")]/span[contains(@class,"fw_300 icon")]/text()')[0]
        # Images are lazy-loaded: before JS rendering the real URL sits in _src
        movie_img = movie.xpath('./a/img/@_src')[0]
        movie_dict = {
            'moviename': movie_name,
            'movienum': movie_num,
            'movie_img': movie_img
        }
        movies.append(movie_dict)
    return movies

def save_movies_to_db(movies):
    # Connect once for the whole batch rather than once per movie
    conn = pymysql.Connect(host='localhost', user='root', password='6666', port=3306, database='mydb1')
    cursor = conn.cursor()
    for movie in movies:
        # Parameterized query: let pymysql quote and escape the values
        cursor.execute('insert into movie(moviename, movienum, movieimg) values(%s, %s, %s)',
                       (movie.get('moviename'), movie.get('movienum'), movie.get('movie_img')))
    conn.commit()  # pymysql does not autocommit, so the inserts must be committed
    cursor.close()
    conn.close()

def save_movies_as_json(movies, page):
    data = {
        'status': 'ok',
        'code': 200,
        'movies': movies
    }
    movie_json = json.dumps(data, ensure_ascii=False)
    with open('%d.json' % page, 'a', encoding='utf-8') as w:
        w.write(movie_json)
        w.flush()

def get_next_url(html_etree):
    flag = html_etree.xpath('//div[contains(@class,"page")]/a[last()]/@title')
    base_url = 'https://www.xinpianchang.com'  # no trailing slash: the href already starts with /
    if not flag:
        print('Last page')
        return None  # ends the while-url loop in main
    next_url = base_url + html_etree.xpath('//div[contains(@class,"page")]/a[last()]/@href')[0]
    return next_url

if __name__ == '__main__':
    url = 'https://www.xinpianchang.com/channel/index/sort-like?from=navigator'
    page = 1
    while url:
        html_etree = get_movies(url)
        movies = extract_movies(html_etree)
        # save_movies_to_db(movies)
        save_movies_as_json(movies, page)
        url = get_next_url(html_etree)
        page += 1
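An alternative to replaying the whole raw Cookie header is handing requests a parsed dict through its cookies parameter. Which entries the login actually requires is an assumption (Authorization and PHPSESSID look like the session cookies in the header above; when in doubt, carry over the full set). A minimal sketch with placeholder values:

import requests

# Placeholder cookie values -- substitute the ones from your own logged-in session.
cookies = {
    'Authorization': '<your Authorization cookie>',
    'PHPSESSID': '<your PHPSESSID cookie>',
}
response = requests.get(
    'https://www.xinpianchang.com/channel/index/sort-like?from=navigator',
    cookies=cookies,
)
print(response.status_code)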
Summary:
To get past the login wall, add the cookie from a logged-in session to the request headers.
Even so, the crawl stopped at page 46: the server's anti-scraping policy (rate-based blocking) kicked in. One common mitigation is sketched below.
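A minimal sketch of that mitigation: pause between requests and back off on failure. The delay values are arbitrary guesses, not limits the site publishes:

import time
import random
import requests

def polite_get(url, headers=None, retries=3):
    """GET with a random pause and a simple retry/back-off.
    The 1-3 s pause is a guess at what the rate limiter tolerates."""
    for attempt in range(retries):
        time.sleep(random.uniform(1, 3))  # pause before every request
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response
        time.sleep(5 * (attempt + 1))  # back off harder after each refusal
    return None

Swapping polite_get in for requests.get inside get_movies slows the crawl down but makes a mid-crawl ban less likely.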