Douban crawler code
This one scrapes Douban books; tweaking it to scrape movies or music instead should be easy.
Architecture:
1. urllib.request usage, browser header spoofing (a User-Agent picked at random from several), and decoding of Chinese characters. This is the part I think turned out best.
2. Proxy IP harvesting. Douban counts requests per IP and bans an IP that hits it too often, so we go through proxies. To avoid bans we rotate among different proxy IPs — but where do they come from? We scrape them, then use the scraped IPs to scrape more IPs, and so on in a loop. Perfect.
3. Regular expressions, even better when combined with BeautifulSoup4 (see the sketch right after this list).
4. Statistics and data mining. This part depends on what you need — filtering and so on (a post-processing sketch follows the script below).
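On point 3: the script below sticks to plain re. Here is a minimal sketch of mixing in BeautifulSoup4, assuming `pip install beautifulsoup4`. The property="v:itemreviewed" and property="v:average" attributes come straight from the regexes in the script, but treat them as illustrative rather than a guarantee about Douban's current markup:

import re
from bs4 import BeautifulSoup

def parse_book_page(html):
    # BeautifulSoup handles the tag structure...
    soup = BeautifulSoup(html, 'html.parser')
    title_tag = soup.find('span', property='v:itemreviewed')
    title = title_tag.get_text(strip=True) if title_tag else None
    rating_tag = soup.find('strong', property='v:average')
    rating = rating_tag.get_text(strip=True) if rating_tag else '无评分'
    # ...while re still handles the fuzzy free text, e.g. a count buried in a sentence
    total = re.findall(r'共(\d+)', html)
    return title, rating, total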
#encoding=UTF-8
import os
import random
import re
import time
import urllib.request
def url_open_help(url, iplist):
    # Install a randomly chosen proxy (Douban pages are served over https, so
    # register the proxy for both schemes), then build a request carrying a
    # random browser User-Agent so we do not look like a script.
    ip = random.choice(iplist)
    proxy_support = urllib.request.ProxyHandler({'http': ip, 'https': ip})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    req = urllib.request.Request(url)
    agent = ['Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/47.0',
             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
             'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0',
             'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/44.0']
    req.add_header('User-Agent', random.choice(agent))
    return req
def url_open(url, iplist):
    # Open the prepared request (not the bare URL, or the spoofed header is lost)
    req = url_open_help(url, iplist)
    try:
        respond = urllib.request.urlopen(req)
    except:
        # Blocked or dead proxy: back off, re-pick a proxy and retry once
        print("Please wait 30s")
        time.sleep(30)
        req = url_open_help(url, iplist)
        respond = urllib.request.urlopen(req)
    html = respond.read()
    return html
def calculate(url1, iplist):
    # The header of the first results page contains the total hit count;
    # grab whatever sits between "共" and the closing div
    cal_html = url_open(url1, iplist).decode('utf-8')
    q = re.findall("索结果1-15 共(.*?)</div>", cal_html)
    return int(q[0])
def save_raw(strr, author):
    # Append one record per line to the author's raw.txt (note the hardcoded local path)
    filename = "D:/PHP/wamp/www/eee/eee/eee/" + author + "/raw.txt"
    q = open(filename, 'a')
    q.write(strr + '\n')
    q.close()
def save_true(strr, author):
    # Same, but true.txt only receives records that actually have a rating
    filename = "D:/PHP/wamp/www/eee/eee/eee/" + author + "/true.txt"
    m = open(filename, 'a')
    m.write(strr + '\n')
    m.close()
def calculate_tongji(url2, iplist, s, author):
    # Remember one proxy from the current pool; it is returned so the next
    # round of proxy harvesting can run through a proxy that still works.
    ip2 = random.choice(iplist)
    tongji_html = url_open(url2, iplist).decode('utf-8')
    str_raw = []
    str_true = []
    # Collect the book detail-page links on this results page, deduplicated
    q = re.findall("href=\"(.*?[0-9])\/\"", tongji_html)
    f = set(q)
    p = 0
    for each in f:
        text = url_open(each, iplist).decode('utf-8')
        str_1 = re.findall("property=\"v:itemreviewed\">(.*?)</span>", text)   # title
        str_2 = re.findall("property=\"v:average\">(.*?)</strong>", text)      # rating
        if len(str_2) == 0 or len(str_2[0]) == 2:
            # A 2-character match here means the page shows no numeric rating
            str_2 = ["无评分"]
        str_3 = re.findall("property=\"v:votes\">(.*?)</span>", text)          # vote count
        if len(str_3) == 0:
            str_3.append("评价人数不足")
        str_4s = re.findall("class=\"intro\">(.*?)</div>", text, re.S | re.M)  # introduction
        if len(str_4s) == 0:
            str_4s.append("<p>无内容简介</p>")
        str_4 = re.findall("<p>(.*?)</p>", str_4s[0], re.S | re.M)
        p = p + 1
        str_num = str(s) + "%" + str(p)
        # One '%'-delimited record: page%index%title%rating%votes%intro
        strr = str_num + "%" + str_1[0] + "%" + str_2[0] + "%" + str_3[0] + "%" + str_4[0]
        # Round-trip through GBK to drop characters GBK cannot represent, so
        # writing the file on a Chinese Windows locale does not crash
        strr = strr.encode("GBK", 'ignore').decode("GBK", 'ignore')
        str_raw.append(strr)
        save_raw(strr, author)
        if len(re.findall("无评分", strr)) == 0:
            # Only rated books make it into true.txt
            str_true.append(strr)
            save_true(strr, author)
        print(strr)
        time.sleep(random.randint(1, 2))
    return ip2
def find_ip(find_time, ip2):
    # Thin wrapper kept for readability at the call sites
    iplist = get_ip(find_time, ip2)
    return iplist
def get_ip(find_time, ip2):
    # Harvest fresh proxies from kuaidaili.com. On the first call we bootstrap
    # through a hand-picked seed proxy; on later calls we go through ip2, the
    # proxy that just worked against Douban. If the proxy site itself fails,
    # fall back to another hard-coded list and retry.
    fallback = ['123.59.86.206:80', '171.217.112.209:9999', '123.59.86.194:80', '163.125.73.182:9999', '113.122.10.185:808']
    if find_time == 1:
        ip_initial = ['221.182.132.30:8000', '112.95.17.95:8118', '113.242.174.241:8118', '103.59.178.17:80', '116.30.153.22:9797']
        proxy = random.choice(ip_initial)
    else:
        proxy = ip2
    url = "http://www.kuaidaili.com/"
    iplist = []
    ip_first = []
    while len(ip_first) == 0:
        proxy_support = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
        opener = urllib.request.build_opener(proxy_support)
        urllib.request.install_opener(opener)
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/47.0')
        try:
            ip_net = urllib.request.urlopen(req)
        except:
            print("Please wait 50s")
            time.sleep(50)
            proxy = random.choice(fallback)
            continue
        ip_html = ip_net.read().decode("UTF-8", 'ignore')
        ip_first = re.findall("<tbody>(.*?)</tbody>", ip_html, re.S | re.M)
        if len(ip_first) == 0:
            # Got a response but no proxy table: probably a block page, retry
            print("A bit stuck, retrying")
            proxy = random.choice(fallback)
    tbody = ip_first[0]
    ip_odd = re.findall("<tr>(.*?)</tr>", tbody, re.S | re.M)
    for each in ip_odd:
        flag = re.findall("<td data-title=\"类型\">(.*?)</td>", each, re.S | re.M)
        ip_address = re.findall("<td data-title=\"IP\">(.*?)</td>", each, re.S | re.M)
        ip_port = re.findall("<td data-title=\"PORT\">(.*?)</td>", each, re.S | re.M)
        ip = ip_address[0] + ':' + ip_port[0]
        if flag[0] != "HTTP":
            # Keep only the HTTPS-capable proxies, since Douban is served over https
            iplist.append(ip)
    return iplist
def download():
    find_time = 1
    ip2 = "0"
    iplist = find_ip(find_time, ip2)
    author = input("Enter the author: ")
    # One folder per author, holding the raw dump and the filtered results
    os.mkdir(author)
    os.chdir(author)
    f = open("raw.txt", 'w')
    f.close()
    l = open("true.txt", 'w')
    l.close()
    # Note: the search keyword is hardcoded (%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91
    # is the URL-encoded 村上春树); the author you type only names the output folder
    url1 = 'https://book.douban.com/subject_search?search_text=%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91&cat=1001'
    books_num = calculate(url1, iplist)
    find_time = 2
    i = books_num // 15  # 15 results per page
    print(i)
    for s in range(i):
        url2 = 'https://book.douban.com/subject_search?start=' + str(15 * s) + '&search_text=%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91&cat=1001'
        ip2 = calculate_tongji(url2, iplist, s, author)
        if s != 0 and s % 2 == 0:
            # Refresh the proxy pool every two pages, seeded with the last good proxy
            iplist = find_ip(find_time, ip2)

download()
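For point 4, a minimal post-processing sketch: it reloads the '%'-delimited records that calculate_tongji writes to true.txt (page%index%title%rating%votes%intro) and ranks the books by rating. The "GBK" encoding below is an assumption based on the GBK round-trip in the script; adjust it to whatever your locale actually wrote:

def top_books(path="true.txt", n=10):
    # Split on the five '%' separators only; the intro itself may contain '%'
    records = []
    with open(path, encoding="GBK") as f:  # assumed encoding, see note above
        for line in f:
            parts = line.rstrip('\n').split('%', 5)
            if len(parts) == 6:
                page, idx, title, rating, votes, intro = parts
                records.append((title, float(rating), votes))
    # true.txt only holds rated books, so float() is safe here
    return sorted(records, key=lambda r: r[1], reverse=True)[:n]

for title, rating, votes in top_books():
    print(title, rating, votes)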
For testing and exchange only; I take no responsibility for any illegal use.
Thanks to Huihui, my angel.