提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
前言
爬虫系列。
提示:以下是本篇文章正文内容,下面案例可供参考
一、数据爬取
示例:
二、使用步骤
1.引入库
代码如下(示例):
import csv
import os

import pymysql
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
2.读入数据
代码如下(示例):
def get_url():
    """Yield the absolute detail-page URL of every hero on the hero list page.

    Fetches http://pvp.qq.com/web201605/herolist.shtml, pulls the relative
    hrefs out of the hero list <ul>, and yields each one as a full URL.

    Yields:
        str: absolute URL of one hero detail page.

    Raises:
        requests.HTTPError: if the list page returns an HTTP error status.
    """
    base_url = 'http://pvp.qq.com/web201605/herolist.shtml'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
        'Cookie':'RK=swQA3R4MTO; ptcz=fbbbfb166ef6016e5ef8db808b8e3fc849a10172cbfbf171112f9bf6246b8f1f; pgv_pvi=1645316096; pgv_pvid=3000228160; tvfe_boss_uuid=076500798fce0fbe; LW_uid=H1Q506i6v039A431t746D0l7m1; eas_sid=G1q5z6m6n059z4Z1q7J6H0a7t9; LW_sid=K1k5J6R888L9k9w3z0M5J1G2n6; ied_qq=o1248452992; o_cookie=1248452992; pac_uid=1_1248452992; uin_cookie=o1248452992; isHostDate=18389; isOsSysDate=18389; isOsDate=18389; PTTuserFirstTime=1588809600000; PTTosSysFirstTime=1588809600000; PTTosFirstTime=1588809600000; pgv_info=ssid=s9336335392; ts_uid=3000228160; weekloop=0-0-0-19; ieg_ingame_userid=i9k868I8dfvOg6iKeu5I8xnEmvJUtQBi; pt2gguin=o1248452992; ts_last=pvp.qq.com/web201605/herodetail/190.shtml; pvpqqcomrouteLine=herolist_herodetail_herodetail; PTTDate=1588834714567',
    }
    # timeout keeps the crawler from hanging forever on a stalled connection
    response = requests.get(url=base_url, headers=headers, timeout=10)
    # fail fast on HTTP errors instead of silently parsing an error page
    response.raise_for_status()
    # the page is served as GBK; without this requests may guess the wrong encoding
    response.encoding = 'gbk'
    selector = etree.HTML(response.text)
    url_list = selector.xpath("//ul[@class='herolist clearfix']/li/a/@href")
    for url in url_list:
        # hrefs are relative (e.g. 'herodetail/190.shtml'); make them absolute
        yield 'http://pvp.qq.com/web201605/' + url
def get_name_and_url(url):
    """Yield (skin_name, image_url) pairs scraped from one hero detail page.

    The skin list on the page is rendered by JavaScript, so a headless Chrome
    instance is driven via Selenium instead of a plain HTTP request.

    Args:
        url: absolute URL of a hero detail page (from get_url()).

    Yields:
        tuple[str, str]: (skin name text, absolute image URL).
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    chrome = webdriver.Chrome(options=options)
    try:
        chrome.get(url)
        # find_elements_by_* was removed in Selenium 4; use the By API instead
        lis = chrome.find_elements(By.CSS_SELECTOR, '.pic-pf-list li')
        for li in lis:
            # data-imgname holds a protocol-relative path; prepend the scheme
            img_url = 'http:' + li.find_element(By.TAG_NAME, 'img').get_attribute('data-imgname')
            name = li.find_element(By.TAG_NAME, 'p').text
            yield name, img_url
    finally:
        # always release the browser process, even if scraping fails mid-page
        chrome.quit()
def save_to_local(name, img_url):
    """Download the image at img_url and save it as ./wangzhe/<name>.png.

    Args:
        name: file stem for the saved image (the skin/hero name).
        img_url: absolute URL of the image to download.

    Raises:
        requests.HTTPError: if the image request returns an HTTP error status.
    """
    # exist_ok avoids the check-then-create race of os.path.exists()
    os.makedirs('./wangzhe/', exist_ok=True)
    response = requests.get(img_url, timeout=10)
    response.raise_for_status()
    # the with-block closes (and flushes) the file automatically;
    # no manual flush()/close() needed
    with open('./wangzhe/' + name + '.png', 'wb') as f:
        f.write(response.content)
    print('success')
def insertMysql(name, img_url):
    """Insert one (name, img_url) row into the `wangzhe` table of `spiders`.

    Args:
        name: hero/skin name to store.
        img_url: image URL to store.

    Raises:
        pymysql.MySQLError: if the connection or the insert fails.
    """
    connect = pymysql.connect(host='localhost', user='root', password='123456',
                              database='spiders', port=3306)
    try:
        # the cursor context manager closes the cursor even if execute raises
        with connect.cursor() as cursor:
            # parameterized query: pymysql escapes the values, preventing SQL injection
            sql = 'insert into wangzhe(name, img_url) values(%s, %s)'
            cursor.execute(sql, (name, img_url))
        connect.commit()
    finally:
        # close the connection even when execute/commit raises
        connect.close()
def save_to_csv(name, img_url):
    """Append one (name, img_url) record to wangzhe.csv in the working directory."""
    record = (name, img_url)
    # newline='' lets the csv module control line endings itself (per the docs)
    with open('wangzhe.csv', 'a', encoding='utf-8', newline='') as out:
        csv.writer(out).writerow(record)
    print('success')
# Script entry point: crawl every hero page and save each skin image locally.
# The __main__ guard keeps the crawl from running on a mere import.
if __name__ == '__main__':
    for url in get_url():
        for name, img_url in get_name_and_url(url):
            save_to_local(name, img_url)
上面的代码通过 url 发起网络请求,并把获取到的数据保存到本地。
总结
提示:这里对文章进行总结:
例如:以上就是今天要讲的内容,本文仅仅简单介绍了爬虫的基本流程,借助 requests、lxml、selenium 等库可以快速便捷地抓取并保存网页数据。