Web scraping
A quick example: scraping Baidu search results (looking for 易烊千玺 pictures)
import urllib.request
import urllib.parse
# urlencode turns Chinese characters into URL-safe query parameters
url = 'http://www.baidu.com/s?'
wd = input('Enter your search term: ')
pn = int(input('Enter how many pages to fetch: '))
kw = {'wd': wd}
kw = urllib.parse.urlencode(kw)
url = url + kw
for i in range(1, pn + 1):
    offset = (i - 1) * 10  # Baidu's pn parameter counts results, 10 per page
    full_url = url + '&pn=' + str(offset)
    print(full_url)
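To see what the encoding step actually produces, here is a minimal standalone sketch using only the standard library:

import urllib.parse

# urlencode takes a dict and percent-encodes each value into key=value pairs
print(urllib.parse.urlencode({'wd': '易烊千玺'}))
# -> wd=%E6%98%93%E7%83%8A%E5%8D%83%E7%8E%BA

# quote percent-encodes a bare string instead of a dict
print(urllib.parse.quote('易烊千玺'))
# -> %E6%98%93%E7%83%8A%E5%8D%83%E7%8E%BA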
Anti-scraping
Any of the four methods above can block our crawler.
Every anti-scraping measure can be worked around, though. Baidu, for example, serves the first request and then starts refusing, so let's deal with that.
Solution:
Send a User-Agent and a Cookie with the request,
and rewrite the crawler in an object-oriented style.
import urllib.request
import urllib.parse
# Python's SSL handling module
import ssl

class BaiDuSpider():
    def __init__(self):
        self.base_url = 'http://www.baidu.com/s?'
        # Ignore unverified SSL certificates
        self.context = ssl._create_unverified_context()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
            'Cookie': 'BIDUPSID=A788109073D25C5835442A1CE0B02C4E; PSTM=1586859523; BAIDUID=A788109073D25C58F7C6049D2995F5B4:FG=1; BD_UPN=12314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; sugstore=1; H_WISE_SIDS=148503_147408_143879_147277_148321_147893_148193_145870_145332_147280_146537_148002_147847_147828_147891_147347_127969_146034_147238_146551_148207_145418_147024_146732_131423_128700_132548_147527_125124_107313_146849_146824_146396_144966_147301_145607_148070_148346_144762_146053_145395_110085; H_PS_PSSID=31729_1434_31672_21125_31069_31762_31271_31714_30824_31846; delPer=0; BD_CK_SAM=1; PSINO=5; ZD_ENTRY=baidu; COOKIE_SESSION=146329_0_8_7_10_6_0_0_8_5_1_0_0_0_9_0_1591442784_0_1591589104%7C9%23262_3_1589796774%7C2; BD_HOME=1; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm'
        }
    def send_request(self, full_url):
        # Send the request
        # response = urllib.request.urlopen(full_url, context=self.context)
        # The line above cannot carry a User-Agent; instead we build a Request
        # object and put the UA into its headers ourselves
        request = urllib.request.Request(full_url, headers=self.headers)
        response = urllib.request.urlopen(request, context=self.context)  # send the request
        if response.getcode() == 200:
            return response
        else:
            print('Request failed')
    def parse_content(self):
        # Parse the content
        pass
    def save_content(self, response, i):
        # Save the content
        with open('baidu_%d.html' % i, 'wb') as f:
            f.write(response.read())
    def start(self):
        # Kick off the crawl
        wd = input('Enter your search term: ')
        pn = int(input('Enter how many pages to fetch: '))
        kw = {'wd': wd}
        kw = urllib.parse.urlencode(kw)
        for i in range(1, pn + 1):
            offset = (i - 1) * 10  # 10 results per page
            full_url = self.base_url + kw + '&pn=' + str(offset)
            print(full_url)
            response = self.send_request(full_url)
            self.save_content(response, i)

if __name__ == '__main__':
    bds = BaiDuSpider()
    bds.start()
Even so, it only manages to scrape once; after that... emmmm.
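One common mitigation, not part of the original code and only a hedged sketch, is to pause between requests so the traffic looks less bursty (the URLs here are just stand-ins for the pages built earlier):

import time
import urllib.request

# Hypothetical: iterate over page URLs built the same way as above,
# sleeping between requests to be gentler on the server
full_urls = ['http://www.baidu.com/s?wd=test&pn=0', 'http://www.baidu.com/s?wd=test&pn=10']
for full_url in full_urls:
    response = urllib.request.urlopen(full_url)
    print(response.getcode())
    time.sleep(2)  # wait two seconds before the next request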
Scraping neihanba (内涵段子)
https://www.neihanba.com/dz/
First, analyze the URL
That mostly means analyzing the parameters, e.g. how the page number changes, as sketched below.
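Concretely, the pattern the analysis turns up (and that the start() method below relies on) is: page 1 is the bare /dz/ path, and every later page is list_<n>.html:

# Print the first few page URLs to confirm the pattern
for i in range(1, 4):
    if i == 1:
        print('https://www.neihanba.com/dz/')
    else:
        print('https://www.neihanba.com/dz/list_%d.html' % i)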
Code
import urllib.request
import urllib.parse
# Python's SSL handling module
import ssl

class NeiHanBaSpider():
    def __init__(self):
        self.base_url = 'https://www.neihanba.com/dz/'
        self.context = ssl._create_unverified_context()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        }
    def send_request(self, full_url):
        # What gets requested here depends on what start() passes in
        print(full_url)
        request = urllib.request.Request(full_url, headers=self.headers)
        response = urllib.request.urlopen(request, context=self.context)
        if response.getcode() == 200:
            return self.parse_content(response)
        else:
            print('Request failed')
    def save_content(self):
        pass
    def parse_content(self, response):
        # decode() has already turned the bytes into a string,
        # so the file is opened in 'w' (text) mode, not 'wb'
        html = response.read().decode('gb2312')
        with open('neihan.html', 'w', encoding='utf-8') as f:
            f.write(html)
    def start(self):
        # range(1, 2) only fetches page 1; widen the range for more pages
        for i in range(1, 2):
            if i == 1:
                self.send_request(self.base_url)
            else:
                full_url = self.base_url + 'list_%d.html' % i
                self.send_request(full_url)

if __name__ == '__main__':
    nh = NeiHanBaSpider()
    nh.start()
Run this and the page gets scraped and saved.
Code walkthrough
response = urllib.request.urlopen(request, context=self.context)
urlopen does the actual page fetch: once the request goes through, this is where the data comes back.
html = response.read().decode('gb2312')
Which codec to decode with depends on the page's character encoding; check the charset declared in the page source.
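If you would rather not read the page source by hand, the server usually declares the charset in the Content-Type response header. A minimal sketch (the fallback to gb2312 is an assumption based on this particular site):

import urllib.request
import ssl

context = ssl._create_unverified_context()
response = urllib.request.urlopen('https://www.neihanba.com/dz/', context=context)
charset = response.headers.get_content_charset()  # None if the header omits it
html = response.read().decode(charset or 'gb2312')
print(len(html))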
Scrape and save the div contents
A sample of the target text (kept in the original Chinese): 做一个被别人需要的女人!而不是做一个需要别人的女人!最近看到很火的一句话:你笑我拼命挣钱狼狈不堪,我笑你离开“男人”吃饭都难。这话好经典!送给努力
import urllib.request
import urllib.parse
# Python's SSL handling module
import ssl
import re

class NeiHanBaSpider():
    def __init__(self):
        self.base_url = 'https://www.neihanba.com/dz/'
        self.context = ssl._create_unverified_context()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        }
    def send_request(self, full_url):
        # What gets requested here depends on what start() passes in
        print(full_url)
        request = urllib.request.Request(full_url, headers=self.headers)
        response = urllib.request.urlopen(request, context=self.context)
        if response.getcode() == 200:
            return self.parse_content(response)
        else:
            print('Request failed')
    def save_content(self):
        pass
    def parse_content(self, response):
        html = response.read().decode('gb2312')  # already a string now, so 'w' mode, not 'wb'
        # with open('neihan.html', 'w') as f:
        #     f.write(html)
        # 'w' writes text, 'wb' writes bytes
        # The whole page was saved above; now extract the div contents
        pattern = re.compile(r'<div class="f18 mb20">(.*?)</div>', re.S)
        # re.S lets . match newlines; the ? turns off greedy matching
        result = pattern.findall(html)
        print(result)
        with open('dz.html', 'w', encoding='utf-8') as f:
            f.write('\n'.join(result))  # save the extracted paragraphs, not the whole page
    def start(self):
        for i in range(1, 2):
            if i == 1:
                self.send_request(self.base_url)
            else:
                full_url = self.base_url + 'list_%d.html' % i
                self.send_request(full_url)

if __name__ == '__main__':
    nh = NeiHanBaSpider()
    nh.start()
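To make the comment about re.S and the ? concrete, compare greedy and non-greedy matching on a two-div snippet:

import re

html = '<div class="f18 mb20">first</div>\n<div class="f18 mb20">second</div>'
greedy = re.findall(r'<div class="f18 mb20">(.*)</div>', html, re.S)
lazy = re.findall(r'<div class="f18 mb20">(.*?)</div>', html, re.S)
print(greedy)  # one match spanning both divs: .* runs to the LAST </div>
print(lazy)    # ['first', 'second']: .*? stops at the first </div>
# Without re.S the . would not cross the newline between the divs at all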
Case study
Scraping Baidu Tieba
import urllib.request
import urllib.parse
import re
import ssl
import os

class BaiDuSpider():
    def __init__(self):
        self.base_url = 'https://tieba.baidu.com/f?'
        self.context = ssl._create_unverified_context()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        }
    def send_request(self, full_url):
        request = urllib.request.Request(url=full_url, headers=self.headers)
        response = urllib.request.urlopen(request, context=self.context)
        # with open('tieba.html', 'wb') as f:
        #     f.write(response.read())
        if response.getcode() == 200:
            return response
    def parse_content(self, response):
        content = response.read().decode('utf-8')  # decode the returned page body
        links = re.findall(r'href="(/p/\d+)"', content, re.S)  # list of thread links
        for link in links:
            detail_url = 'https://tieba.baidu.com' + link
            response = self.send_request(detail_url)
            self.parse_detail(response)
    def parse_detail(self, response):
        content = response.read().decode('utf-8')
        pic_links = re.findall(r'<img\sclass="BDE_Image"\ssrc="(.*?)".*?>', content, re.S)
        for pic in pic_links:
            response = self.send_request(pic)
            self.save_content(response, pic)
    def save_content(self, response, pic):
        name = pic.rsplit('/', 1)[1]  # use the last URL segment as the file name
        print('Saving %s' % name)
        save_dir = r'D:\python_create_files\爬虫\pics'
        os.makedirs(save_dir, exist_ok=True)
        # Write each image to its own file instead of overwriting one path
        with open(os.path.join(save_dir, name), 'wb') as f:
            f.write(response.read())
    def start(self):
        kw = input('Enter the name of the tieba to scrape: ')
        # URL-encode the keyword into what the browser actually sends
        kw = urllib.parse.urlencode({'kw': kw})
        self.base_url = self.base_url + kw
        page = int(input('Enter how many pages to scrape: '))
        for i in range(1, page + 1):
            pn = (i - 1) * 50  # tieba pages in steps of 50 threads
            full_url = self.base_url + '&pn=' + str(pn)
            print(full_url)
            response = self.send_request(full_url)
            self.parse_content(response)

if __name__ == '__main__':
    bd = BaiDuSpider()
    bd.start()