Douban crawler code
This one scrapes Douban books; tweaking it to scrape movies or music instead should be easy.
Architecture:
1. urllib.request usage, browser header spoofing (a User-Agent picked at random from several), and decoding of Chinese characters. This is the part I think turned out best.
2. Proxy IP harvesting. Douban counts requests per IP and bans an IP that hits it too often, so we go through proxies. To avoid bans we rotate among different proxy IPs — but where do they come from? We scrape them, then use the scraped IPs to scrape more IPs, and so on in a loop. Perfect.
3. Regular expressions, even better when combined with BeautifulSoup4 (see the sketch right after this list).
4. Statistics and data mining. This part depends on what you need — filtering and so on (a post-processing sketch follows the script below).
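On point 3: the script below sticks to plain re. Here is a minimal sketch of mixing in BeautifulSoup4, assuming `pip install beautifulsoup4`. The property="v:itemreviewed" and property="v:average" attributes come straight from the regexes in the script, but treat them as illustrative rather than a guarantee about Douban's current markup:

import re
from bs4 import BeautifulSoup

def parse_book_page(html):
    # BeautifulSoup handles the tag structure...
    soup = BeautifulSoup(html, 'html.parser')
    title_tag = soup.find('span', property='v:itemreviewed')
    title = title_tag.get_text(strip=True) if title_tag else None
    rating_tag = soup.find('strong', property='v:average')
    rating = rating_tag.get_text(strip=True) if rating_tag else '无评分'
    # ...while re still handles the fuzzy free text, e.g. a count buried in a sentence
    total = re.findall(r'共(\d+)', html)
    return title, rating, total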
#encoding=UTF-8
import os
import random
import re
import time
import urllib.request
def url_open_help(url, iplist):
    # Install a randomly chosen proxy (Douban pages are served over https, so
    # register the proxy for both schemes), then build a request carrying a
    # random browser User-Agent so we do not look like a script.
    ip = random.choice(iplist)
    proxy_support = urllib.request.ProxyHandler({'http': ip, 'https': ip})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    req = urllib.request.Request(url)
    agent = ['Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/47.0',
             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
             'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0',
             'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/44.0']
    req.add_header('User-Agent', random.choice(agent))
    return req
def url_open(url, iplist):
    # Open the prepared request (not the bare URL, or the spoofed header is lost)
    req = url_open_help(url, iplist)
    try:
        respond = urllib.request.urlopen(req)
    except:
        # Blocked or dead proxy: back off, re-pick a proxy and retry once
        print("Please wait 30s")
        time.sleep(30)
        req = url_open_help(url, iplist)
        respond = urllib.request.urlopen(req)
    html = respond.read()
    return html
def calculate(url1, iplist):
    # The header of the first results page contains the total hit count;
    # grab whatever sits between "共" and the closing div
    cal_html = url_open(url1, iplist).decode('utf-8')
    q = re.findall("索结果1-15 共(.*?)</div>", cal_html)
    return int(q[0])
def save_raw(strr, author):
    # Append one record per line to the author's raw.txt (note the hardcoded local path)
    filename = "D:/PHP/wamp/www/eee/eee/eee/" + author + "/raw.txt"
    q = open(filename, 'a')
    q.write(strr + '\n')
    q.close()
def save_true(strr, author):
    # Same, but true.txt only receives records that actually have a rating
    filename = "D:/PHP/wamp/www/eee/eee/eee/" + author + "/true.txt"
    m = open(filename, 'a')
    m.write(strr + '\n')
    m.close()
def calculate_tongji(url2, iplist, s, author):
    # Remember one proxy from the current pool; it is returned so the next
    # round of proxy harvesting can run through a proxy that still works.
    ip2 = random.choice(iplist)
    tongji_html = url_open(url2, iplist).decode('utf-8')
    str_raw = []
    str_true = []
    # Collect the book detail-page links on this results page, deduplicated
    q = re.findall("href=\"(.*?[0-9])\/\"", tongji_html)
    f = set(q)
    p = 0
    for each in f:
        text = url_open(each, iplist).decode('utf-8')
        str_1 = re.findall("property=\"v:itemreviewed\">(.*?)</span>", text)   # title
        str_2 = re.findall("property=\"v:average\">(.*?)</strong>", text)      # rating
        if len(str_2) == 0 or len(str_2[0]) == 2:
            # A 2-character match here means the page shows no numeric rating
            str_2 = ["无评分"]
        str_3 = re.findall("property=\"v:votes\">(.*?)</span>", text)          # vote count
        if len(str_3) == 0:
            str_3.append("评价人数不足")
        str_4s = re.findall("class=\"intro\">(.*?)</div>", text, re.S | re.M)  # introduction
        if len(str_4s) == 0:
            str_4s.append("<p>无内容简介</p>")
        str_4 = re.findall("<p>(.*?)</p>", str_4s[0], re.S | re.M)
        p = p + 1
        str_num = str(s) + "%" + str(p)
        # One '%'-delimited record: page%index%title%rating%votes%intro
        strr = str_num + "%" + str_1[0] + "%" + str_2[0] + "%" + str_3[0] + "%" + str_4[0]
        # Round-trip through GBK to drop characters GBK cannot represent, so
        # writing the file on a Chinese Windows locale does not crash
        strr = strr.encode("GBK", 'ignore').decode("GBK", 'ignore')
        str_raw.append(strr)
        save_raw(strr, author)
        if len(re.findall("无评分", strr)) == 0:
            # Only rated books make it into true.txt
            str_true.append(strr)
            save_true(strr, author)
        print(strr)
        time.sleep(random.randint(1, 2))
    return ip2
def find_ip(find_time, ip2):
    # Thin wrapper kept for readability at the call sites
    iplist = get_ip(find_time, ip2)
    return iplist
def get_ip(find_time, ip2):
    # Harvest fresh proxies from kuaidaili.com. On the first call we bootstrap
    # through a hand-picked seed proxy; on later calls we go through ip2, the
    # proxy that just worked against Douban. If the proxy site itself fails,
    # fall back to another hard-coded list and retry.
    fallback = ['123.59.86.206:80', '171.217.112.209:9999', '123.59.86.194:80', '163.125.73.182:9999', '113.122.10.185:808']
    if find_time == 1:
        ip_initial = ['221.182.132.30:8000', '112.95.17.95:8118', '113.242.174.241:8118', '103.59.178.17:80', '116.30.153.22:9797']
        proxy = random.choice(ip_initial)
    else:
        proxy = ip2
    url = "http://www.kuaidaili.com/"
    iplist = []
    ip_first = []
    while len(ip_first) == 0:
        proxy_support = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
        opener = urllib.request.build_opener(proxy_support)
        urllib.request.install_opener(opener)
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/47.0')
        try:
            ip_net = urllib.request.urlopen(req)
        except:
            print("Please wait 50s")
            time.sleep(50)
            proxy = random.choice(fallback)
            continue
        ip_html = ip_net.read().decode("UTF-8", 'ignore')
        ip_first = re.findall("<tbody>(.*?)</tbody>", ip_html, re.S | re.M)
        if len(ip_first) == 0:
            # Got a response but no proxy table: probably a block page, retry
            print("A bit stuck, retrying")
            proxy = random.choice(fallback)
    tbody = ip_first[0]
    ip_odd = re.findall("<tr>(.*?)</tr>", tbody, re.S | re.M)
    for each in ip_odd:
        flag = re.findall("<td data-title=\"类型\">(.*?)</td>", each, re.S | re.M)
        ip_address = re.findall("<td data-title=\"IP\">(.*?)</td>", each, re.S | re.M)
        ip_port = re.findall("<td data-title=\"PORT\">(.*?)</td>", each, re.S | re.M)
        ip = ip_address[0] + ':' + ip_port[0]
        if flag[0] != "HTTP":
            # Keep only the HTTPS-capable proxies, since Douban is served over https
            iplist.append(ip)
    return iplist
def download():
    find_time = 1
    ip2 = "0"
    iplist = find_ip(find_time, ip2)
    author = input("Enter the author: ")
    # One folder per author, holding the raw dump and the filtered results
    os.mkdir(author)
    os.chdir(author)
    f = open("raw.txt", 'w')
    f.close()
    l = open("true.txt", 'w')
    l.close()
    # Note: the search keyword is hardcoded (%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91
    # is the URL-encoded 村上春树); the author you type only names the output folder
    url1 = 'https://book.douban.com/subject_search?search_text=%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91&cat=1001'
    books_num = calculate(url1, iplist)
    find_time = 2
    i = books_num // 15  # 15 results per page
    print(i)
    for s in range(i):
        url2 = 'https://book.douban.com/subject_search?start=' + str(15 * s) + '&search_text=%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91&cat=1001'
        ip2 = calculate_tongji(url2, iplist, s, author)
        if s != 0 and s % 2 == 0:
            # Refresh the proxy pool every two pages, seeded with the last good proxy
            iplist = find_ip(find_time, ip2)

download()
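For point 4, a minimal post-processing sketch: it reloads the '%'-delimited records that calculate_tongji writes to true.txt (page%index%title%rating%votes%intro) and ranks the books by rating. The "GBK" encoding below is an assumption based on the GBK round-trip in the script; adjust it to whatever your locale actually wrote:

def top_books(path="true.txt", n=10):
    # Split on the five '%' separators only; the intro itself may contain '%'
    records = []
    with open(path, encoding="GBK") as f:  # assumed encoding, see note above
        for line in f:
            parts = line.rstrip('\n').split('%', 5)
            if len(parts) == 6:
                page, idx, title, rating, votes, intro = parts
                records.append((title, float(rating), votes))
    # true.txt only holds rated books, so float() is safe here
    return sorted(records, key=lambda r: r[1], reverse=True)[:n]

for title, rating, votes in top_books():
    print(title, rating, votes)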
For testing and exchange only; I take no responsibility for any illegal use.
Thanks to Huihui, my angel.