Crawler workflow
The basic flow: specify the target URL, send the request, receive the response, then parse and persist the data. The introductory example below walks through each step.
Introductory crawler example
import requests

url = "https://www.sogou.com/web"
rep = requests.get(url=url)
repEncode = rep.encoding  # character encoding detected from the response
filename = "sogou.html"   # output path for the fetched page
with open(filename, "w", encoding=repEncode) as f:
    f.write(rep.text)
About the response object
Method | Result |
---|---|
response.status_code | HTTP status code; 200 means success |
response.encoding | Character encoding detected for the page |
response.text | Response body as a str |
response.content | Response body as bytes |
response.url | The requested URL |
response.headers | Response headers |
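A quick sketch exercising these attributes against the Sogou URL from the example above:

import requests

rep = requests.get("https://www.sogou.com/web")
print(rep.status_code)                    # 200 on success
print(rep.encoding)                       # encoding inferred from the headers
print(rep.url)                            # final URL, after any redirects
print(rep.headers)                        # response headers (case-insensitive dict)
print(type(rep.text), type(rep.content))  # <class 'str'> <class 'bytes'>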
General-purpose crawling - the requests module
Code examples
GET request
"""
带参数发送get请求, 之后所有代码都会进行ua伪装
"""
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
url = "https://www.sogou.com/web"
params = {
"query": "美女图片",
}
rep = requests.get(url=url, params=params, headers=headers)
repEncode = rep.encoding
with open(filename, "w", encoding=repEncode) as f:
f.write(rep.text)
POST request
import requests

url = "xxx"  # placeholder; substitute the real endpoint
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
data = {
    "keyA": "valueA",
    "keyB": "valueB",
}
rep = requests.post(url=url, data=data, headers=headers)  # data= sends a form-encoded body
repText = rep.text
print(repText)
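If the endpoint returns JSON instead of HTML, response.json() parses the body directly; a minimal sketch, assuming a JSON API behind the placeholder URL:

import requests

# "xxx" is still a placeholder; substitute a real JSON endpoint
rep = requests.post(url="xxx", data={"keyA": "valueA"})
result = rep.json()  # dict/list; raises an error if the body is not valid JSON
print(result)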
AJAX GET request
import requests

url = "http://image.so.com/zjl?ch=beauty&sn=60&listtype=new&temp=1"
param = {
    "ch": "beauty",
    "sn": "60",
    "listtype": "new",
    "temp": "1",
}
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
rep = requests.get(url=url, params=param, headers=headers)
repEncode = rep.encoding
repText = rep.text
print(repText)
with open("360pic美女.html", "w", encoding=repEncode) as f:
    f.write(repText)
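Note that this endpoint actually returns JSON rather than HTML, so response.json() is the more natural way to consume it. A hedged sketch; the "list" and "qhimg_url" field names are assumptions about this API, not verified:

import requests

url = "http://image.so.com/zjl?ch=beauty&sn=60&listtype=new&temp=1"
payload = requests.get(url=url).json()
# "list" / "qhimg_url" are assumed field names; adjust after inspecting the payload
for item in payload.get("list", []):
    print(item.get("qhimg_url"))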
AJAX POST request
import requests

url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
data = {
    "cname": "",
    "pid": "",
    "keyword": "北京",  # search keyword: Beijing
    "pageindex": "3",
    "pageSize": "10",
}
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
rep = requests.post(url=url, data=data, headers=headers)
print(rep.text)
Exercise - a basic crawler
import requests

url = "https://www.sogou.com/sogou"
keyword = input("input key word: ")
startPage = int(input("input start page: "))
endPage = int(input("input end page: "))
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
for i in range(startPage, endPage + 1):
    params = {
        "query": keyword,
        "page": str(i),
    }
    # proxies= routes the request through an HTTP proxy (optional)
    rep = requests.get(url=url, params=params, headers=headers, proxies={"http": "117.127.16.207:8080"})
    repEncode = rep.encoding
    fileName = "{}_{}.html".format(keyword, i)
    with open(fileName, "w", encoding=repEncode) as f:
        f.write(rep.text)
Focused crawling
Regex
Introduction
Regular expressions work directly on the raw response text, so no parser library is needed.
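The one flag worth knowing here is re.S (DOTALL), which lets "." also match newlines so a pattern can span several lines of HTML. A minimal sketch on made-up markup:

import re

html = '<div class="thumb">\n<img src="//pic.example.com/a.jpg">\n</div>'
# without re.S the ".*?" cannot cross the newline and nothing matches
print(re.findall('<div class="thumb">.*?<img src="//(.*?)">', html, re.S))
# -> ['pic.example.com/a.jpg']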
Code example
import requests
import re
import os

url = "https://www.qiushibaike.com/pic/"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
rep = requests.get(url=url, headers=headers)
repText = rep.text
# non-greedy match across lines (re.S) pulls the protocol-relative image URLs
re_rule = '<div class="thumb">.*?<img src="//(.*?)" .*?</div>'
img_url = re.findall(re_rule, repText, re.S)
if not os.path.exists("./qiutu"):
    os.mkdir("./qiutu")
for i in img_url:
    url = "http://" + i
    r = requests.get(url=url, headers=headers)
    rCont = r.content
    filename = i.split("/")[-1]
    file = "./qiutu/" + filename
    with open(file, "wb") as f:
        f.write(rCont)  # images are binary, so write r.content in "wb" mode
    print("{} downloaded".format(file))
xpath
xpath instantiation
from lxml import etree

# parse a local file (file_path is a placeholder)
tree = etree.parse(file_path)
tree.xpath("xpath_expression")
# parse an HTML string (e.g. response.text)
tree = etree.HTML(page_text)
tree.xpath("xpath_expression")
xpath expressions
Attribute match:
//div[@class="song"]
Hierarchy & index:
//div[@class="tang"]/ul/li[2]/a
Logical operators:
//a[@href="" and @class="du"]
Fuzzy match:
//div[contains(@class, "ng")]
//div[starts-with(@class, "ta")]
Extract text:
//div[@class="song"]/p[1]/text()
//div[@class="tang"]//text()
Extract attribute:
//div[@class="tang"]//li[2]/a/@href
Code example
import requests
from lxml import etree
import xlwt

def write_excel(data):
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')
    for i in range(len(data)):
        for j in range(len(data[i])):
            # xpath() returns lists; join them into one string before writing
            if isinstance(data[i][j], list):
                data[i][j] = ", ".join(data[i][j])
            worksheet.write(i, j, data[i][j])
    workbook.save('Excel_test.xls')

def spider():
    """
    Crawl the target url, clean up the data,
    and return it as a list of rows.
    :return: list
    """
    url = "http://newgame.17173.com/shouyou/ceshi"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    }
    rep = requests.get(url=url, headers=headers)
    rep_text = rep.text
    tree = etree.HTML(rep_text)
    div_list = tree.xpath('//div[@class="g-box4 box"][1]//ul[2]/li')
    # header row: game name, launch date, test type, genre, platform, studio
    ret = [["游戏名", "上线时间", "测试类型", "游戏类型", "平台", "工作室"]]
    for i in div_list:
        # a relative xpath (leading ".") searches inside the current <li>
        name = i.xpath('.//h6[@class="c1"]/a/text()')
        time = i.xpath('.//p[@class="c2"]/text()')
        qa_type = i.xpath('.//p[@class="c3"]/text()')
        game_type = i.xpath('.//i[@class="c4"]/text()')
        plate = i.xpath('./p[@class="c5"]/span//text()')
        auth = i.xpath('.//span[@class="c7"]/text()')
        data = [name, time, qa_type, game_type, plate, auth]
        ret.append(data)
    return ret

if __name__ == "__main__":
    data = spider()
    write_excel(data)
bs4
Usage
soup = BeautifulSoup(open('localfile'), 'lxml')  # parse a local file
soup = BeautifulSoup(rep.content, 'lxml')        # parse bytes (e.g. response.content)
Using the soup object's methods
Methods
soup.a                             # first <a> tag in the document
soup.a.attrs                       # all attributes of that tag, as a dict
soup.a.attrs["href"]               # one attribute value
soup.a["href"]                     # shorthand for the same
soup.a.string                      # the tag's own string (None if it has child tags)
soup.a.text                        # all text, descendants included
soup.a.get_text()                  # same as .text
soup.find("a")                     # first matching tag
soup.find("a", title="xxx")        # filter by attribute
soup.find("a", class_="xxx")       # class_ avoids the Python keyword "class"
soup.find("a", id="xxx")
soup.find_all("a")                 # every matching tag, as a list
soup.find_all(["a", "b"])          # match any of several tag names
soup.find_all("a", limit=2)        # only the first two matches
soup.select("#xxx")                # CSS selector, returns a list
soup.select(".book-mulu > ul > li")
Selectors
- Basic selectors
    - tag selector (div)
    - class selector (.)
    - id selector (#)
- Hierarchy selectors (see the sketch after this list)
    - .a #b .c #d --> // (descendant selector)
    - .a > #b > .c > #d --> / (child selector)
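The descendant/child distinction in one sketch:

from bs4 import BeautifulSoup

html = '<div class="a"><ul><li id="b"><span class="c">deep</span></li></ul></div>'
soup = BeautifulSoup(html, "lxml")
print(soup.select(".a .c"))    # descendant selector: matches at any depth
print(soup.select(".a > .c"))  # child selector: [] because .c is not a direct child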
Code example
import requests
from bs4 import BeautifulSoup

url = "http://www.shicimingju.com/book/sanguoyanyi.html"
url_root = "http://" + url.split('/')[2]
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
rep = requests.get(url=url, headers=headers)
rep_text = rep.text
soup = BeautifulSoup(rep_text, 'lxml')
fp = open('./sanguo.txt', 'w', encoding="utf-8")
# each <li> in the table of contents links to one chapter
li_list = soup.select(".book-mulu > ul > li")
for i in li_list:
    title = i.a.text
    content = url_root + i.a["href"]
    content_page_text = requests.get(url=content, headers=headers).text
    # a fresh soup for the chapter page, so the outer soup is not clobbered
    detail_soup = BeautifulSoup(content_page_text, "lxml")
    content_text = detail_soup.find('div', class_="chapter_content").text
    fp.write(title + "\n" + content_text + "\n")
fp.close()
Using cookies
# a session object persists cookies across requests
session = requests.session()
session.post(url=url, data=data, headers=headers)     # e.g. log in; the session stores the cookies
session.get(url=url, params=params, headers=headers)  # later requests send them automatically
Code example
import requests

url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019731514156"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
# login form fields, typically copied from the browser's network panel
data = {
    "email": "15810020949",
    "icode": "",
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": "1",
    "captcha_type": "web_login",
    "password": "xxxxxxxxxxxxxxxxxxxxxxxxx",
    "rkey": "386db1871dce1b360f18ae81a91aabdf",
    "f": "http%3A%2F%2Fwww.renren.com%2F266632989",
}
session = requests.session()
# log in first; the session keeps the returned cookies
session.post(url=url, headers=headers, data=data)
# this page requires the login cookies, which the session sends automatically
url = "http://www.renren.com/266632989/newsfeed/photo"
rep = session.get(url=url, headers=headers)
with open('./renren.html', 'w', encoding='utf-8') as f:
    f.write(rep.text)