I. Using requests in Python 3
import requests
import random

USER_AGENTS = []  # fill with User-Agent strings, see Section IV
# proxies = [{"http": "106.56.102.143:8070"}, {"http": "110.73.8.51:8123"}]
proxies1 = {
    'http': '116.213.98.6:8080',
    'https': '14.118.254.21:6666'
}
r = requests.get("http://www.baidu.com",
                 headers={'User-Agent': random.choice(USER_AGENTS)},  # pick a User-Agent at random
                 proxies=proxies1)
print(r)
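The commented-out proxies list above suggests rotating the proxy as well as the User-Agent; a minimal sketch of doing both (assuming a filled-in USER_AGENTS list; the proxy addresses are the sample ones from the comment and may well be stale):

import requests
import random

USER_AGENTS = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"]  # fill from Section IV
proxies = [{"http": "106.56.102.143:8070"}, {"http": "110.73.8.51:8123"}]  # sample proxies, likely no longer alive

r = requests.get("http://www.baidu.com",
                 headers={'User-Agent': random.choice(USER_AGENTS)},  # random browser identity
                 proxies=random.choice(proxies))                      # random proxy each call
print(r.status_code)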
II. Examples
1. Scraping job listings from 51job
# -*- coding: utf-8 -*-
import random
import re
import xlwt
from urllib import request

# ------------------------------- 51job
def getHtml(url):  # fetch the page content
    USER_AGENTS = [...]  # list of User-Agent strings (see Section IV)
    proxies = ["123.138.89.1339:999",
               "101.132.122.230:3128",
               "222.186.12.102:57624"]  # proxy IPs, replace as needed
    req = request.Request(url)  # build the request for the given URL
    req.add_header('User-Agent', random.choice(USER_AGENTS))  # pick a browser identity at random
    proxy_support = request.ProxyHandler({"http": random.choice(proxies)})  # pick a proxy at random
    opener = request.build_opener(proxy_support)  # build an opener that goes through the proxy
    request.install_opener(opener)
    res = request.urlopen(req)  # send the request
    html = res.read()
    return html
# 2. Fetch one page of search results and extract the job records
def joblist(jobname, pagenumber):
    url = "https://search.51job.com/list/000000,000000,0000,00,9,99," + str(jobname) + ",2," + str(pagenumber) + ".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    html = getHtml(url)
    # the chardet module could be used to detect the encoding instead:
    # code = chardet.detect(html)["encoding"]
    html = html.decode("gbk").encode("utf-8")  # the page is GBK-encoded; re-encode it as UTF-8
    # 2. Build the regular expression
    regstr = '<p class="t1 ">.*?<a target="_blank" title="(.*?)".*?' \
             '<span class="t2"><a target="_blank" title="(.*?)".*?' \
             '<span class="t3">(.*?)</span>.*?' \
             '<span class="t4">(.*?)</span>.*?' \
             '<span class="t5">(.*?)</span>'  # captures: job title, company, location, salary, post date
    reg = re.compile(regstr, re.S)
    # 3. Extract the results
    result = re.findall(reg, html.decode("utf-8", "replace"))
    return result
# print(joblist("python",3))
# 3. Global list that collects all rows
datalist = []

# 4. Append the scraped rows to the global datalist
def deal(pagenumber, jobname):
    global datalist
    # loop over the requested number of pages and collect every record
    for k in range(int(pagenumber)):
        data = joblist(jobname, k + 1)
        for i in data:
            datalist.append(i)
    print(datalist)
#
# 5. Save the results
def saveexcel(jobname, filename):
    # save to an Excel file
    book = xlwt.Workbook(encoding='utf-8')  # create the workbook; the encoding keeps Chinese text intact
    sheet = book.add_sheet(str(jobname))    # worksheet named after the job keyword
    cols = (u'职位名', u'公司名', u'工作地点', u'薪资', u'发布时间')  # header row: title, company, location, salary, post date
    for i in range(len(cols)):
        sheet.write(0, i, cols[i])
    for i in range(len(datalist)):
        for j in range(len(datalist[i])):
            sheet.write(i + 1, j, datalist[i][j])
    book.save(filename)

def savetext(filename):
    for i in range(0, len(datalist)):
        data = datalist[i]
        with open(filename, "a") as f:
            f.write(data[0] + '\t' + data[2] + '\t' + data[3] + '\t' + data[4] + '\n')
    return
#
def main(jobname, pagenumber, filename):
    deal(pagenumber, jobname)
    if "txt" in filename:
        savetext(filename)
    if "xls" in filename:
        saveexcel(jobname, filename)

main('python', 2, u"py语言.xls")
2. Scraping links from Tmall
import re
import xlwt
from urllib import request

# ------------------------------------ Tmall
# 01. Fetch the Tmall homepage (reuses getHtml() from the 51job example)
url = 'https://www.tmall.com'
html = getHtml(url).decode('utf-8')
# 02. Build the regular expression
# sample link: //list.tmall.com/search_product.htm?from=mallfp..pc_1.0_hq&click_id=针织衫&q=针织衫
reg = re.compile('<a href="(.*)">(.*)</a>')
links = re.findall(reg, html)
# print(len(links))
# Save to a local Excel file
# 01. Create the workbook; remember to set the encoding, or prefix the strings below with u'' (e.g. u'天猫')
wbk = xlwt.Workbook(encoding='utf-8')
# 02. Create a worksheet
sheet = wbk.add_sheet('天猫')
# 03. Write the header row
col = ('编号', '内容', '链接')  # index, text, link
for i in range(len(col)):
    sheet.write(0, i, col[i])
# 04. Write the row index and the extracted data
for i in range(len(links)):
    sheet.write(i + 1, 0, i + 1)
    for j in range(len(links[i])):
        sheet.write(i + 1, j + 1, links[i][j])
# 05. Save to disk
wbk.save('tianmao.xls')
3. Scraping the Douban Top 250 movie ranking
import requests
import chardet
from bs4 import BeautifulSoup
import random
# Fetch one page of the ranking
def getHtml(index):
    USER_AGENTS = [...]  # list of User-Agent strings (see Section IV)
    proxies = {"http": "117.63.78.64:6666",
               "https": "114.225.169.215:53128"}  # proxy IPs, one entry per scheme
    url = "https://movie.douban.com/top250?start=" + str(index * 25) + "&filter="
    r = requests.get(url, headers={'User-Agent': random.choice(USER_AGENTS)}, proxies=proxies)
    code = chardet.detect(r.content)["encoding"]
    return r.content.decode(code)
# Global list that collects all rows
data = []
import re
reg = re.compile(r'.*?(\d{4}).*?')  # extract the 4-digit release year
def getData(m):
    for i in range(m):
        html = getHtml(i)  # content of page i
        soup = BeautifulSoup(html, 'html.parser')
        parent = soup.find("div", attrs={'id': 'content'})
        # get all the <li> entries
        lis = parent.find_all("li")
        for li in lis:
            name = li.find('div', attrs={'class': 'pic'}).find('a').find('img')['alt']
            time = li.find("div", attrs={'class': "info"}).find('div', attrs={'class': 'bd'}).find("p").get_text()
            time1 = re.findall(reg, time)[0]  # take the first match (the release year)
            score = li.find('div', attrs={'class': 'star'}).find('span', attrs={'class': 'rating_num'}).string
            num = li.find("div", attrs={'class': "info"}).find('p', attrs={'class': 'quote'}).find('span').string
            data.append([name, time1, score, num])
    return data
# Write the results to Excel
import xlwt
def main(n, filename):
    listsum = getData(n)
    workbook = xlwt.Workbook(encoding='utf-8')
    sheets = workbook.add_sheet("电影")
    cols = ["电影名称", "上映时间", "评分", "评价"]  # title, release year, rating, quote
    for i in range(len(cols)):
        sheets.write(0, i, cols[i])
    for i in range(len(listsum)):
        for j in range(len(listsum[0])):
            sheets.write(i + 1, j, listsum[i][j])
    return workbook.save(filename)

main(4, "豆瓣电影排行.xls")
print('ok!!!')
4. Scraping proxy IPs
import requests
import chardet
from bs4 import BeautifulSoup
import random
# Fetch one page of the proxy list
def getHtml(pagenumber):
    USER_AGENTS = [...]  # list of User-Agent strings (see Section IV)
    proxies = [{"http": "117.63.78.64:6666"},
               {"https": "114.225.169.215:53128"},
               {"https": "222.185.22.108:6666"}]  # proxy IPs
    url = "http://www.xicidaili.com/nn/" + str(pagenumber)
    r = requests.get(url, headers={"User-Agent": random.choice(USER_AGENTS)}, proxies=random.choice(proxies))
    code = chardet.detect(r.content)["encoding"]
    r.encoding = code
    soup = BeautifulSoup(r.text, "html.parser")
    IPList = soup.find('div', attrs={'id': 'body'}).find('table', attrs={'id': "ip_list"})  # the table that holds every row
    # print(IPList)
    return IPList
# getHtml(1)
List = []
list2 = []
import re
reg = re.compile(r'(\d+).*?')   # extract the numeric part of the lifetime
reg2 = re.compile(r'\d+(\w+)')  # extract the unit (minutes or days)
def getip(page):
    for i in range(1, page + 1):
        tr = getHtml(i).find_all('tr', attrs={'class': 'odd'})  # find all table rows
        for j in tr:  # walk every row
            td = j.find_all('td')
            tdIP = td[1].string         # IP address
            tddkou = td[2].string       # port
            tdhttp = td[5].string       # protocol
            tdtimealive = td[8].string  # lifetime
            print(tdtimealive)
            List.append([tdhttp, tdIP, tddkou, tdtimealive])
    for k in List:
        time = re.findall(reg, k[3])[0]  # without [0] this would be a one-element list; time is the digit string
        if ("天" in k[3]) and int(time) >= 1:  # keep only proxies that have been alive for at least one day
            list2.append(k)  # filtered results
    # print(list2)
    return list2  # one big list with all the kept rows
# getip(1)
import xlwt
def main(n, filename):
    list3 = getip(n)
    workbook = xlwt.Workbook(encoding='utf-8')
    sheets = workbook.add_sheet("IP")
    cols = ["IP号", "IP", "端口", "存活时间"]  # headers: protocol, IP, port, lifetime
    for i in range(len(cols)):
        sheets.write(0, i, cols[i])
    for i in range(len(list3)):
        for j in range(len(list3[0])):
            sheets.write(i + 1, j, list3[i][j])
    return workbook.save(filename)

main(1, "IP代理.xls")
5. Downloading images
# -*- coding: utf-8 -*-
# Approach: use request.urlretrieve() to download the remote file straight to disk
from urllib import request
import os
import random
import requests
from bs4 import BeautifulSoup
import chardet
imgList1=[]
def getHtml(number):
    USER_AGENTS = [...]  # list of User-Agent strings (see Section IV)
    proxies = [{"http": "117.63.78.64:6666"},
               {"https": "114.225.169.215:53128"},
               {"https": "222.185.22.108:6666"}]  # proxy IPs
    url = "http://www.27270.com/tag/637_" + str(number) + ".html"
    r = requests.get(url, headers={"User-Agent": random.choice(USER_AGENTS)}, proxies=random.choice(proxies))
    code = chardet.detect(r.content)["encoding"]
    r.encoding = code
    soup = BeautifulSoup(r.text, "html.parser")
    img = soup.find('ul', attrs={'id': 'Tag_list'}).find_all("img")
    for i in img:
        imgList1.append(i)
    return imgList1
getHtml(1)
# Save the images
def getImages(pageNum, name):
    # create the target folder if it does not exist yet
    if not os.path.exists(name):
        os.mkdir(name)
    os.chdir(name)
    for k in range(pageNum):
        # 1. Get the list of <img> tags for page k+1
        eachsoup = getHtml(k + 1)
        print(eachsoup)  # list of all image tags found so far
        # 2. Loop over every image tag
        for i in eachsoup:
            # the src attribute holds the real image URL (which may be png, jpg, gif, etc.)
            suffix = i['src']
            print(suffix)  # image URL
            image_name = i['alt']
            print(image_name)
            request.urlretrieve(suffix, image_name + str('.jpg'))
    return

getImages(3, '美女')
III. Miscellaneous
1. Regular expressions
1.1 Building a pattern with re.compile
pattern = re.compile(pattern_string, flags=0)
The flags parameter selects the matching mode; several flags can be combined with the bitwise OR operator "|", e.g. re.I | re.M.
re.I (IGNORECASE): case-insensitive matching
re.M (MULTILINE): multi-line mode, changes the behaviour of "^" and "$"
re.S (DOTALL): makes "." match any character, including newlines
re.L (LOCALE): makes the predefined classes \w \W \b \B \s \S depend on the current locale
re.U (UNICODE): makes the predefined classes \w \W \b \B \s \S depend on Unicode character properties
re.X (VERBOSE): verbose mode; the pattern may span multiple lines, whitespace is ignored, and comments are allowed
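A quick sketch of these flags in action (the sample strings below are made up for illustration):

import re

# re.I: case-insensitive match
print(re.match(re.compile('hello', re.I), 'HELLO Python'))   # matches
# re.S: "." also matches the newline, so the whole body is captured
print(re.findall(re.compile('<p>(.*?)</p>', re.S), '<p>line1\nline2</p>'))  # ['line1\nline2']
# re.I | re.M: flags combined with "|"; "^" now matches at every line start
print(re.findall(re.compile('^ok', re.I | re.M), 'OK first\nok second'))    # ['OK', 'ok']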
1.2 Common regex symbols
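The original table of symbols is not reproduced here; as a placeholder, a short sketch of the tokens that appear throughout the examples below (the sample string is made up):

import re

s = "he is jack, phone 13812345678"
print(re.findall(r'\d', s))              # \d  one digit
print(re.findall(r'\w+', s))             # \w  word character, + means one or more
print(re.findall(r'is\s', s))            # \s  whitespace
print(re.findall(r'j.ck', s))            # .   any single character
print(re.findall(r'\d{3,5}', s))         # {m,n} repeat between m and n times
print(re.findall(r'[a-z]+@', "abc@def")) # []  character class
print(re.findall(r'^he', s))             # ^   start of string, $ end of string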
1.3 Expression examples
# Regular expressions
import re
pattern = re.compile('hello', re.I)  # re.I makes the match case-insensitive
# ------------------- match
res1 = re.match(pattern, "hello")
res2 = re.match(pattern, "hello CQC")
res3 = re.match(pattern, "he")
res4 = re.match(pattern, "Hello")
# match() starts at the beginning of the string; it returns a match object on success, otherwise None
res5 = re.search(pattern, "Hello")
# search() succeeds if the pattern occurs anywhere in the string
print(res1)
print(res2)
print(res3)
print(res4)
print(res5)
#
#
# ------------------- other matches
# ^ anchors the start, $ anchors the end
reg = re.compile('k')
print(re.search(reg, "he is jack"))
# \b is a word boundary; in a plain string \b means backspace, so write it as a raw string r'\b'
reg = re.compile(r'\bis{2}\b')
print(re.search(reg, "he is is haha is hahahhaha"))
reg2 = re.compile(r'\w{2}')   # {} gives the repeat count
reg3 = re.compile(r'\d{2}')   # two consecutive digits
reg4 = re.compile(r'[a-z]\d[a-z]{3}')    # a letter, a digit, then three letters
reg5 = re.compile(r'([a-z]\d[a-z]){3}')  # the "a5c" shape repeated three times
reg6 = re.compile(r'\d{2,4}')            # 2 to 4 consecutive digits, equal or different
reg7 = re.compile(r'\d{2}\d{3}\d{4}')    # 9 consecutive digits, equal or different
print(re.search(reg6, "d23234ed"))
print(re.search(reg, "he is jack is haha is hahahhaha"))
print(re.search(reg2, "he is jack is haha is hahahhaha"))
# Repetition with \w \d \s and * + ?
reg = re.compile(r'\d+')  # one or more digits
print(re.search(reg, "abc1"))
reg = re.compile(r'\d{11}')      # exactly 11 digits
reg = re.compile(r'138\d{8}')    # 138 followed by 8 digits
reg = re.compile(r'1[2-9]\d{9}') # 11-digit mobile number: 1, a digit 2-9, then 9 more digits
print(re.findall(reg, '1239194798410478174}'))
# email address
reg10 = re.compile(r'[a-zA-Z1-9]+\w+@\w*\.\w+')  # "+" means one or more,
# "*" means zero or more,
# "\." matches a literal dot
2. URL-encoding Chinese text
import urllib
from urllib import parse
from urllib import request

str2 = '人工智能'
print(urllib.parse.quote(str2))  # percent-encode
str3 = '%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD'
print(urllib.parse.unquote(str3))  # decode
str4 = '%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD'
print(urllib.parse.unquote(urllib.parse.unquote(str4)))  # doubly encoded, so decode twice
3. BeautifulSoup
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Restrict what gets matched with a regular expression
import re
reg = re.compile(r'\w{1}')
soup = BeautifulSoup(html, "html.parser")
# print(soup)
print(soup.find_all('p', text=reg))
print(soup.find('body').find_all('a', recursive=False))
# ------------------- BeautifulSoup basics
# Build the soup object with the default parser
soup = BeautifulSoup(html, 'html.parser')
# print(soup)
print(soup.title)
print(soup.title.name)    # title
print(soup.title.string)  # The Dormouse's story
print(soup.title.text)    # same as the line above
# Navigate upwards with parent
print(soup.title.parent)
print("---------------------------------------------------------------------------------")
# Navigate downwards through the children
print(soup.p)
print('---------------------------------------------------------------------------------')
for i in soup.p:  # newlines count as children too
    print(i)
print('---------------------------------------------------------------------------------')
print(soup.find('p'))
print(soup.find_all('p'))
print('---------------------------------------------------------------------------------')
# print(soup.head)
a = soup.a
print(a)
print(a.attrs)
print(a.id)        # attribute-style access looks for a child tag named "id", so this prints None
print(a.get('id'))
print(a['id'])
print(a.text)
print('---------------------------------------------------------------------------------')
print(soup.find_all('a', {'id': 'link3'}))
print(soup.find_all('a', {'class': 'sister'}))
for i in soup.find_all('a', {'class': 'sister'}):
    print(i['href'])
IV. Sample User-Agent header strings
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
"Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"
]