Python crawlers

In brief: encoding POST data with the urllib library, and interacting with web pages via urllib2.
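
As a warm-up, a minimal sketch of the two pieces the summary mentions: urllib.urlencode turns a dict of form fields into an application/x-www-form-urlencoded string, and passing that string as the data argument switches urllib2 from GET to POST. The echo endpoint http://httpbin.org/post is only an illustrative stand-in:

import urllib
import urllib2

# Encode the form fields as 'key1=val1&key2=val2'
postdata = urllib.urlencode({'username': 'test', 'type': '1'})

# Supplying a data argument makes urllib2 issue a POST
response = urllib2.urlopen('http://httpbin.org/post', postdata)
print(response.read())
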
1. Scraping the content of a Baidu Tieba thread.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib2
import re

# Baidu Tieba spider
class tbSpider:
    def __init__(self,url):
        self.myurl=url
        self.datas=[]
        self.mytool=htmlTool()

    # Start the crawl: get the title and page count, then save the posts
    def startSpider(self):
        title,sumpage = self.getInfo()
        self.saveData(title,sumpage)

    # Collect all pages and write the cleaned posts to <title>.txt
    def saveData(self,title,sumpage):
        self.getData(sumpage)
        f = open(title+".txt",'w+')
        f.writelines(self.datas)
        f.close()
        print(u"Data saved.")

    # Fetch the first page and read the thread title and total page count
    def getInfo(self):
        mypage = urllib2.urlopen(self.myurl).read().decode('utf-8')
        title = self.titleHunter(mypage)
        sumpage = self.pageCounter(mypage)
        print(u"Got the title and total page count.")
        return title , sumpage

    # Walk pages 1..sumpage and pull each post body out of the HTML
    def getData(self,sumpage):
        url = self.myurl + "&pn="
        for i in range(1,sumpage + 1):
            mypage = urllib2.urlopen(url + str(i)).read().decode('utf-8')
            msgs = re.findall(r'id="post_content.*?>(.*?)</div>',mypage,re.S)
            for msg in msgs:
                data = self.mytool.replace(msg.encode('utf-8'))
                self.datas.append(data + "\n")

    # Total page count, read from the <span class="red"> page indicator
    def pageCounter(self,mypage):
        match = re.search(r'class="red">(\d+?)</span>',mypage,re.S)
        if match:
            sumpage = int(match.group(1))
        else:
            sumpage = 0
            print(u"找不到任何楼主的消息")
        return sumpage

    # Thread title, taken from the page's <h1> tag
    def titleHunter(self,mypage):
        match = re.search(r'<h1.*?>(.*?)</h1>',mypage,re.S)
        if match:
            title = match.group(1)
        else:
            title="untitled"
            print(u"Title not found.")
        return title

# HTML tag / text clean-up helper
class htmlTool:
    # strip <img> tags, runs of 1-7 spaces, and &nbsp;
    removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
    # strip hyperlink tags (keeping the link text)
    removeAddr = re.compile('<a.*?>|</a>')
    # turn line-breaking tags into \n
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # turn table cells <td> into \t
    replaceTD= re.compile('<td>')
    # turn single or double <br> into \n
    replaceBR = re.compile('<br><br>|<br>')
    # drop any remaining tags
    removeExtraTag = re.compile('<.*?>')
    # collapse runs of newlines into one
    removeNoneLine = re.compile('\n+')
    def replace(self,x):
        x = re.sub(self.removeImg,"",x)
        x = re.sub(self.removeAddr,"",x)
        x = re.sub(self.replaceLine,"\n",x)
        x = re.sub(self.replaceTD,"\t",x)
        x = re.sub(self.replaceBR,"\n",x)
        x = re.sub(self.removeExtraTag,"",x)
        x = re.sub(self.removeNoneLine,"\n",x)
        # strip() drops leading/trailing whitespace
        return x.strip()
# Prompt for the thread id and build the single-poster (see_lz=1) URL
bdurl = 'http://tieba.baidu.com/p/' + raw_input('http://tieba.baidu.com/p/') + "?see_lz=1"

# Create the spider and start crawling
myspider=tbSpider(bdurl)
myspider.startSpider()
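
Running the script and entering a thread id at the prompt (e.g. 123456) makes the spider fetch http://tieba.baidu.com/p/123456?see_lz=1 page by page and write the cleaned posts to <title>.txt.

To see what htmlTool.replace actually does, here is a small demo on a made-up post fragment. Note that the ' {1,7}' pattern also eats ordinary spaces, which is harmless for Chinese post text but will merge adjacent English words:

tool = htmlTool()
sample = '<div>line1<br><br><a href="#">link-text</a>&nbsp;<img src="x.png"></div>'
print(tool.replace(sample))
# prints:
# line1
# link-text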

2. Logging in to the 163 mailbox with a spoofed header and saving the cookie.

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib
import urllib2
import cookielib

# 163 Mail login helper: POSTs the credentials and saves the session cookie
class myLog:
    def __init__(self,url,username,password):
        self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
        self.url = url
        self.username = username
        self.password = password
        self.cookiefile = './logcookie.dat'
        # LWP-format jar so the cookies can be saved to / loaded from disk
        self.cookie = cookielib.LWPCookieJar()
        cookprocessor = urllib2.HTTPCookieProcessor(self.cookie)
        # Install the opener globally so every urllib2 call carries the jar
        opener = urllib2.build_opener(cookprocessor)
        urllib2.install_opener(opener)

    def startLog(self):
        # Form fields expected by the login endpoint
        postdata = {
            'username':self.username,
            'password':self.password,
            'type':'1'
            }
        postdata = urllib.urlencode(postdata)
        request = urllib2.Request(self.url,postdata,self.header)
        response = urllib2.urlopen(request).read()
        # Keep session cookies as well, so the file is not written out empty
        self.cookie.save(self.cookiefile,ignore_discard=True,ignore_expires=True)
        print(response)

# Test run (fill in real credentials for the masked values)
url='http://reg.163.com/logins.jsp?type=1&product=mail163&url=http://entry.mail.163.com/coremail/fcg/ntesdoor2?lightweight%3D1%26verifycookie%3D1%26language%3D-1%26style%3D1'
mylog = myLog(url,'***@163.com','***')
mylog.startLog()
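
Once logcookie.dat exists, a later run can reuse the saved session instead of logging in again. A minimal sketch, reloading the jar written by startLog above (the target URL is only a placeholder):

import urllib2
import cookielib

# Load the cookies written by myLog.startLog()
cookie = cookielib.LWPCookieJar()
cookie.load('./logcookie.dat', ignore_discard=True, ignore_expires=True)

# Attach the jar so requests through this opener carry the stored cookies
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
response = opener.open('http://mail.163.com/')  # placeholder target
print(response.read())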