Simple Python code for a crawler: scraping data and inflating page views

Python is the darling of AI and big data, so naturally I wanted to learn it. As a beginner, the first feature I implemented was a crawler: crawling and collecting data. I'll use crawling my own CSDN blog as the example, code attached, so we can all learn together.

I also use an IP proxy technique here, so I'm sharing that as well.
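The core trick in the listing below is classic httplib proxying: instead of connecting to the target site, you open a connection to the proxy's host and port and then request the absolute URL of the page you want. A minimal sketch of just that pattern (the proxy address is a placeholder taken from the comments further down):

import httplib

# connect to the proxy, not to the target site
conn = httplib.HTTPConnection('61.155.164.109', 3128)
# ask the proxy to fetch the absolute URL on our behalf
conn.request('GET', 'http://blog.csdn.net/', headers={'User-Agent': 'Mozilla/5.0'})
r = conn.getresponse()
print r.status  # 200 means the proxy fetched the page for us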

#!/usr/bin/python
# -*- coding:utf-8 -*-
import httplib
import time

import BeautifulSoup


class BaiduImage(object):  # the class name is a leftover from an image-crawler demo; this one crawls a CSDN blog
    def __init__(self):
        super(BaiduImage, self).__init__()
        print u'Crawling, press CTRL+C to exit...'
        self.page = 2  # total page count; range(1, self.page) below visits listing pages 1 .. page-1

    def requestIp(self):
        iplist = []

        conn = httplib.HTTPConnection("www.xicidaili.com")  # free IP proxy listing site
        request_url = "/nt"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
                   'Content-type': 'text/html'}

        conn.request('GET', request_url, headers=headers)

        r = conn.getresponse()
        if r.status == 200:
            # parse the HTML and pull one proxy out of each table row
            data = r.read()
            soup = BeautifulSoup.BeautifulSoup(data)
            result = soup.findAll('tr')
            for row in result[1:]:  # skip the table header row
                ipitem = IPItem()
                mylist = row.findAll('td')
                ipitem.ip = mylist[1].text
                ipitem.port = mylist[2].text
                ipitem.addr = mylist[3].text
                ipitem.type = mylist[5].text
                iplist.append(ipitem)

        self.request(iplist)

    def request(self, iplist):  # fetch the list of CSDN blog articles to crawl
        mylis = []
        for num in range(1, self.page):
            conn = httplib.HTTPConnection("blog.csdn.net")
            request_url = "/songyan_love/article/category/6261675/%s" % (num)
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
                       'Content-type': 'text/html'}

            conn.request('GET', request_url, headers=headers)

            r = conn.getresponse()
            if r.status == 200:
                # parse the article URLs out of the listing page
                data = r.read()
                soup = BeautifulSoup.BeautifulSoup(data)

                result = soup.find('div', 'list_item_new')
                items = result.findAll("div", "list_item article_item")
                for row in items:
                    a = row.find('a')  # the first link in each item is the article link
                    mylis.append(a['href'])
        # walk these URLs and request each article page through the proxies
        self.forrequest(mylis, iplist)

    def forrequest(self, mylis, iplist):
        for item in iplist[0:10]:  # just an example: I only use the first 10 proxies
            for urllist in mylis:
                time.sleep(1)  # sleep 1 second; requesting too fast raises [Errno 10060]
                ipAddress = "%s%s%s" % (item.ip, ":", item.port)  # join into proxy form, e.g. 61.155.164.109:3128
                headers = {'Host': 'blog.csdn.net',
                           'Connection': 'keep-alive',
                           'Cache-Control': ' max-age=0',
                           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
                           'Upgrade-Insecure-Requests': '1',
                           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                           'Accept-Encoding': 'gzip, deflate',
                           'Accept-Language': 'zh-CN,zh;q=0.9',
                           'Cookie': 'uuid_tt_dd=10_19443020900-1513178520094-768335; gr_user_id=fc73959f-68e8-43bd-95e8-4d293c1b111e; bdshare_firstime=1513211345964; kd_user_id=6f361b81-886d-466f-b1a5-9960b742d462; _ga=GA1.2.993355856.1513212738; __utma=17226283.993355856.1513215738.1514427431.1514512093.14; __utmz=17226283.1513215738.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); UN=songyan_love; BT=1514860667627; __yadk_uid=9PFquxFHCElENSRifm5lJAqMLpVog7Ad; Hm_lvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1515143143,1515375437; __message_district_code=000000; uuid=8e1ade52-fae7-48f3-b3cb-fb786e4c7afe; TY_SESSION_ID=670b7d71-fe0c-4b81-b8c5-c504fa320cdf; ADHOC_MEMBERSHIP_CLIENT_ID1.0=8cb01310-dcd7-5119-b554-5d8ed76572e9; __message_sys_msg_id=0; __message_gu_msg_id=0; __message_cnel_msg_id=0; __message_in_school=0; avh=78599641; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1515466838,1515476031,1515477617,1515479260; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1515479260; dc_tos=p29za4; dc_session_id=10_1515461736402.866875',
                           'RA-Ver': '3.0.8',
                           'RA-Sid': 'B781E81A-20150402-024118-ce25e1-ba5345',
                           'If-None-Match': 'W/"3a3ef0fe6385d9241fdcae8c24d1da08',
                           "X-Forwarded-For": ipAddress, }#X-Forwarded-For": ipAddress是为了防止被拦截掉,告诉他们我不是代理,(这是我的理解)
                print ipAddress
                con2 = httplib.HTTPConnection(ipAddress)  # connect to the proxy, not to CSDN directly
                try:
                    myurl = "%s%s" % ('http://blog.csdn.net', urllist)  # absolute URL of the article to request
                    print myurl
                    con2.request("GET", myurl, headers=headers)  # ask the proxy to fetch the article page
                    r = con2.getresponse()
                    if r.status != 200:
                        print "request failed %s" % r.status
                        break
                    else:
                        print "request succeeded %s" % r.status
                except Exception, e:
                    print "request error %s" % e
                    break


class IPItem:
    def __init__(self):
        self.ip = ''  # IP
        self.port = ''  # port
        self.addr = ''  # location
        self.type = ''  # type: http / https
        self.speed = -1  # speed


if __name__ == '__main__':
    # entry point: fetch the proxy list, then start crawling through the proxies
    bi = BaiduImage()
    bi.requestIp()
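The listing takes it on faith that every scraped proxy is alive, but free proxies die constantly. A minimal liveness check in the same httplib style (the checkProxy helper is my own illustration, not part of the original code):

def checkProxy(ipitem):
    # try one cheap fetch through the proxy; any error or non-200 counts as dead
    try:
        conn = httplib.HTTPConnection("%s:%s" % (ipitem.ip, ipitem.port), timeout=5)
        conn.request('GET', 'http://blog.csdn.net/', headers={'User-Agent': 'Mozilla/5.0'})
        return conn.getresponse().status == 200
    except Exception:
        return False

# e.g. filter before crawling: iplist = [p for p in iplist if checkProxy(p)]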
The headers in forrequest are the header data I captured from a real browser request. Hope this helps everyone.
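If you are on Python 3, the whole proxied request collapses into a few lines with the third-party requests library (a sketch, assuming requests is installed; the proxy address is again a placeholder):

import requests

proxies = {'http': 'http://61.155.164.109:3128'}  # placeholder proxy from the scraped list
r = requests.get('http://blog.csdn.net/songyan_love',
                 headers={'User-Agent': 'Mozilla/5.0'},
                 proxies=proxies, timeout=5)
print(r.status_code)  # 200 means the article was fetched through the proxy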
