1、获取规律网址
# Example 1: Douban book-tag listing pages paginate with a "start" offset
# that grows by 20 items per page, so page N uses start=(N-1)*20.
template = 'http://book.douban.com/tag/小说?start={param}&type=T'
for page in range(1, 11):
    url = template.format(param=(page - 1) * 20)
    print('第{}页url:'.format(page), url)
# Example 2: Tianya hot-article pages paginate with a simple "pn" number.
# BUG FIX: the original appended str(page) to `url` itself inside the loop,
# so the page numbers accumulated ("...pn=2", "...pn=23", "...pn=234", ...).
# Build each page URL from the unchanged base instead.
url = 'http://bbs.tianya.cn/hotArticle.jsp?pn='
page_urls = []
for page in range(2, 8):
    page_url = url + str(page)
    page_urls.append(page_url)
    print(page_url)
2、不添加伪装浏览器头
# Section 2: fetch pages WITHOUT a spoofed browser User-Agent header.
import requests
url = 'http://bbs.tianya.cn/hotArticle.jsp?pn=1' # Tianya responds fine without a UA header
url = 'http://book.douban.com/tag/小说?start=0&type=T' # Douban books does NOT respond without one
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=40'
# Douban movies does NOT respond without one either
# NOTE: only the last `url` assignment above is actually requested;
# the earlier ones are kept as alternative examples to try.
resp = requests.get(url)
print(resp.text)
3、添加伪装浏览器头
# Section 3: same requests, but with a spoofed browser User-Agent header.
import requests
h = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
url = 'http://book.douban.com/tag/小说?start=0&type=T' # Douban books now responds
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=40'
# Douban movies now responds too
# NOTE: only the last `url` assignment above is actually requested.
resp = requests.get(url,headers = h)
print(resp.text)
4、数据解析
# 引入pyQuery库
pip3 install pyquery
# CSS选择器
doc('.xxx') #根据class属性 class=“xxx” 类选择器
doc('#id名') #根据id属性 id=“xx” ID选择器
doc('body ul') #根据节点属性 body ul 标签选择器
doc("ul[id = container]") #多种css选择器 doc('html #id名')
# 伪类选择器
:nth-child(n) p:nth-child(n) 找到第n个p节点标签
:first-child p:first-child 找到第一个p节点标签
:last-child p:last-child 找到最后一个p节点标签
:contains() p:contains('second') 找到包含second的节点标签
# 查找标签
doc.items(selector) #迭代每一个元素
item.eq(n) #迭代中查找第n个节点
eq(n).text() #获取节点中的文本
eq(n).attr('属性名') #获取节点属性值
使用方法
# Section 4 usage: fetch a page and explore it with PyQuery selectors.
# FIX: the for-loop body had lost its indentation (SyntaxError), and the
# original rebound `doc` from a requests.Response to a PyQuery object.
import requests
from pyquery import PyQuery  # PyQuery normalizes the fetched HTML

url = "http://bbs.tianya.cn/hotArticle.jsp?pn=1"
resp = requests.get(url)
doc = PyQuery(resp.text)
print(doc)                                  # full page markup
print(type(doc))                            # type of doc (a PyQuery object)
print(doc('ul'))                            # select by tag name
print(doc('.links'))                        # select by class
print(doc('#top_search'))                   # select by id
print(doc("li[id=search_zone_list]"))       # attribute selector: id
print(doc("li[class=clearfix]"))            # attribute selector: class
print(doc("html #top_search"))              # id scoped under html
print(doc("html body li"))                  # multi-level descendant selector
print(doc("li:contains('视频专区')"))        # nodes containing the given text
print(doc("html body li").text())           # text of the matched nodes
for item in doc.items('li'):                # iterate matched elements
    print(item.text())
print([i.text() for i in doc.items('li')])  # list of node texts
print([i.attr('class') for i in doc.items('li')])  # list of class attribute values
5、存放数据到text中
# Section 5: save scraped text into a .txt file.
import requests
from pyquery import PyQuery  # PyQuery normalizes the fetched HTML

url = "http://bbs.tianya.cn/hotArticle.jsp?pn=1"
resp = requests.get(url)
doc = PyQuery(resp.text)
content = doc("html body li").text()
print(content)
txtf = 'C:/Users/Administrator/Desktop/a.txt'
# FIX: use a context manager so the file is closed even if write() raises,
# instead of the original manual open()/close() pair.
with open(txtf, 'a+', encoding='utf-8') as f:
    f.write(content)
6、存放数据到csv中
# Section 6: write rows into a CSV file.
import csv

file = 'C:/Users/Administrator/Desktop/a.csv'
rows = [
    ('name', 'gender', 'age'),  # header row
    ('Tony', 'boy', '22'),
    ('Jeremy', 'boy', '21'),
]
# 'a+' appends to an existing file; gbk encoding keeps Chinese Excel happy;
# newline='' is required by the csv module to avoid blank lines on Windows.
# FIX: context manager replaces the original manual open()/close() pair.
with open(file, 'a+', encoding='gbk', newline='') as csvf:
    csv.writer(csvf).writerows(rows)
7、实例-天涯论坛爬贴
# Section 7 example: scrape a conference listing (title + author columns)
# into a CSV file.
# FIXES: the for-loop body had lost its indentation (SyntaxError); the
# original `from pyquery import pyquery, PyQuery` pulled in an unused
# submodule; and the csv file handle was never closed.
import csv

import requests
from pyquery import PyQuery

#url = 'http://bbs.tianya.cn/hotArticle.jsp?pn=1'
url = 'http://www.allconfs.org/list.asp?yearid=2020'
resp = requests.get(url)
doc = PyQuery(resp.text)
# Iterate whole <tr> rows rather than 'table tbody tr td a' anchors, so the
# title and author can be read from the same row.
with open('C:/Users/Administrator/Desktop/a.csv', 'a+', encoding='gbk', newline='') as csvf:
    writer = csv.writer(csvf)
    writer.writerow(('title', 'author'))
    for row in doc.items('table tbody tr'):
        title = row('td').eq(0).text()   # first cell: title
        author = row('td').eq(1).text()  # second cell: author
        writer.writerow((title, author))
8、实例-电影榜单
# 1 电影排行榜-写入txt
# Section 8: scrape an IMDb-CN movie ranking and write the titles to movie.txt.
# FIXES: the `with` bodies had lost their indentation (SyntaxError), and the
# original first truncated movie.txt and then re-opened it in append mode for
# every single title — opening it once in 'w' mode produces the same file.
import requests
from pyquery import PyQuery as py

h = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/65.0.3325.162 Safari/537.36'
     }
res = requests.get("https://www.imdb.cn/imdbtop-chinese-movies/", headers=h)
doc = py(res.text)
with open('movie.txt', 'w') as k:
    for name in doc.items(".rl_name a"):  # each ranked movie title link
        print(name.text())
        k.write(name.text() + '\n')
9、 文件保存
# Section 9: saving binary files — fetch with requests and write the raw
# resp.content bytes in binary ('wb') mode. Each example below is left
# commented out; uncomment one pair at a time to run it.
import requests
# 1. .jpg image
# res1 = requests.get("https://ss1.bdstatic.com/70cFvXSh_Q1YnxGkpoWK1HF6hhy/it/u=1392186382,1573605807&fm=26&gp=0.jpg")
# with open('image.jpg','wb') as k:
# k.write((res1.content))
# 2. .mp3 audio
# res2 = requests.get("https://www.adrive.com/public/q3Kdbe/wave.mp3")
# with open('wave.mp3','wb') as k:
# k.write((res2.content))
# 3. .gif animation
# res3 = requests.get("https://www.sample-videos.com/gif/3.gif")
# with open('a.gif','wb') as k:
# k.write((res3.content))
# 4. .mp4 video
# res4 = requests.get("http://vjs.zencdn.net/v/oceans.mp4")
# with open('sea.mp4','wb') as k:
# k.write((res4.content))