A simple crawler program that uses proxy servers

A simple crawler; the techniques it relies on:
1. Proxy servers
2. Parallel crawling (implemented below with multiprocessing.Pool)
Still to be improved:
1. A distributed mechanism (a rough sketch is given after the code)

#encoding=utf8
# Python 2 code: urllib2 only exists in Python 2 (urllib.request in Python 3)
import os
from bs4 import BeautifulSoup
import urllib2
from multiprocessing import Pool
import random
def proxys():
    # Read the proxy list (one "ip<TAB>port" entry per line), skipping blank lines
    proxy_cont = open("proxy_ronng_avi.txt")
    proxy_list = [line for line in proxy_cont.read().split('\n') if line.strip()]
    proxy_cont.close()
    return proxy_list

def opener(url, proxy):
    User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
    header = {}
    header['User-Agent'] = User_Agent
    proxy_support = urllib2.ProxyHandler(proxy)  # register the proxy
    # opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler(debuglevel=1))
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)
    req = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(req, None)
    return response

def crawl_ip(page):
    # Crawl one listing page of xicidaili.com and append its "ip<TAB>port" pairs to proxy.txt
    print page

    for proxy_line in proxys():
        # ProxyHandler expects "host:port", while the proxy file stores "ip<TAB>port"
        proxy = {"http": proxy_line.replace('\t', ':')}
        url = 'http://www.xicidaili.com/nn/' + str(page)
        try:
            response = opener(url, proxy)
            soup = BeautifulSoup(response)
            ips = soup.find_all('tr')
        except Exception as e:
            continue

        with open("proxy.txt", 'a+') as f:
            for x in range(1, len(ips)):
                ip = ips[x]
                tds = ip.find_all("td")
                ip_temp = tds[1].contents[0] + "\t" + tds[2].contents[0] + "\n"
                f.write(ip_temp)
        print "final"
        break

# seq is any list-like sequence (equivalent to random.choice)
def randomSelect(seq):
    length=len(seq)
    randIndex=random.randrange(length)
    randItem=seq[randIndex]
    return randItem

def proxyOfAvi(proxy):
    # Ping-test one "ip<TAB>port" proxy; '-n'/'-w' are the Windows ping options
    ip = proxy.split('\t')[0]
    print "Now %s is running" % proxy
    flag = os.system('ping -n 3 -w 3 %s' % ip)
    if flag == 0:
        with open("proxy_avi.txt", "a+") as pro:
            pro.write(proxy + '\n')

def Topic(word):
    # Crawl the question-list pages for one tag on rong360.com and
    # yield the question URLs found on each page.
    proxyAvi = []
    page = 1
    proxys_list = proxys()
    while page:
        if len(proxyAvi) == 0:
            # No proxy has worked yet: pick a random one from the file ("ip<TAB>port")
            proxy_host = randomSelect(proxys_list)
            proxy = {"http": proxy_host.replace('\t', ':')}
        else:
            # Reuse one of the proxies that has already succeeded
            proxy = randomSelect(proxyAvi)

        url = "http://www.rong360.com/ask/tag/" + word + "?q=&pn=" + str(page)
        try:
            response = opener(url, proxy)
            url_con = BeautifulSoup(response)
        except Exception as e:
            if proxy in proxyAvi:
                proxyAvi.remove(proxy)
            # print e
            continue

        try:
            pageflag = url_con.find("div", class_="page")
            ask_list = url_con.find("ul", class_="search_list")
            pageflag = int(pageflag.find_all('a')[-2].string)
            ask_page_hrefs = [line["href"] for line in ask_list.find_all("a")]
        except Exception as e:
            continue
        else:
            print "Now crawling page %d of %s" % (page, word)
            if proxy not in proxyAvi:
                proxyAvi.append(proxy)
            print "Proxy_list length is %d" % (len(proxyAvi))

            if pageflag > page:
                page += 1
            else:
                page = 0
            yield ask_page_hrefs

def ansOfCont(ask_href):
    # Fetch one question page and return the question title joined with each
    # reply ("title<TAB>reply++..."); returns an empty string on failure.
    try:
        b = BeautifulSoup(urllib2.urlopen(ask_href))
        ask_title = b.find("h2", class_="title reply_question_title clearfix").text.encode('utf-8')
        replies = b.find_all("div", class_="reply_content")
        ask_text = "".join(ask_title + "\t" + line.text.encode('utf-8') + "++" for line in replies)
    except Exception as e:
        ask_text = ""
    return ask_text

def to_txt(word, ask_texts):
    '''
    Append the crawled texts to "<word>.txt".
    ask_texts may be a single string or a list of strings.
    '''
    with open("%s.txt" % word.decode('utf-8'), "a+") as f:
        if isinstance(ask_texts, list):
            f.writelines(ask_texts)
        else:
            f.write(ask_texts)


def crawlProxy_main():
    # Crawl proxy listing pages 298..1010 in parallel; candidates go into proxy.txt
    pagelist = range(298, 1011)
    pool = Pool(8)
    pool.map(crawl_ip, pagelist)
    pool.close()
    pool.join()

def AviProxy_main():
    # Ping-test every candidate proxy in parallel; live ones end up in proxy_avi.txt
    proxys_list = [p for p in open("proxy.txt").read().split('\n') if p.strip()]
    pool = Pool(8)
    pool.map(proxyOfAvi, proxys_list)
    pool.close()
    pool.join()

def rong_Crawl(word):
    # Topic() yields one list of question URLs per listing page; fetch every
    # question on the page and append its contents to "<word>.txt"
    for ask_page_hrefs in Topic(word):
        ask_cont_list = map(ansOfCont, ask_page_hrefs)
        to_txt(word, ask_cont_list)

def rong_main():
    wordlist = ['常用知识', "利息咨询", "无抵押贷", "抵押贷款", "按揭贷款", "担保咨询", '信用卡', '消费贷款', '经营贷款', '买车贷款', '买房贷款']
    pool = Pool(8)
    pool.map(rong_Crawl, wordlist)
    pool.close()
    pool.join()




if __name__ == "__main__":
    rong_main()
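
Putting the pieces together: the intended run order appears to be proxy harvesting, then proxy validation, then the actual Q&A crawl. Below is a minimal sketch of that order; it assumes the validated proxies end up in the file that proxys() reads (the code above writes proxy_avi.txt but reads proxy_ronng_avi.txt, so one path would need adjusting).

# Sketch of the full pipeline (not part of the original script):
crawlProxy_main()   # 1. scrape candidate proxies from xicidaili.com into proxy.txt
AviProxy_main()     # 2. ping-test the candidates; live ones are appended to proxy_avi.txt
rong_main()         # 3. crawl the rong360.com Q&A pages through the validated proxies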


Flow diagram: (image not included)
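
On the "still to be improved" point above, one common way to make this crawler distributed is to push the tag words into a shared queue that several machines consume. Here is a minimal sketch assuming a Redis server and the redis-py client; the host address, queue name, and function names are illustrative, not part of the original code.

# -*- coding: utf-8 -*-
# Sketch only: a shared Redis list used as a work queue, so multiple machines
# can crawl different tags at the same time. Host, queue name and timeout are assumptions.
import redis

r = redis.StrictRedis(host="192.168.0.10", port=6379, db=0)
QUEUE = "rong360:tags"

def producer(wordlist):
    # Run once, on any machine: enqueue every tag word to be crawled
    for word in wordlist:
        r.rpush(QUEUE, word)

def worker():
    # Run on each crawling machine: pop tags until the queue stays empty
    while True:
        item = r.blpop(QUEUE, timeout=10)
        if item is None:
            break
        _, word = item
        rong_Crawl(word)   # reuse the single-machine crawl function defined above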
