Summary: encoding POST data with the urllib library, and interacting with web pages via urllib2.
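Both examples below rest on the same two calls: urllib.urlencode turns a dict into an application/x-www-form-urlencoded string, and passing that string as the data argument to urllib2.urlopen turns the request into a POST. A minimal sketch; http://httpbin.org/post is a stand-in echo endpoint, not part of the examples themselves:
# -*- coding: utf-8 -*-
import urllib
import urllib2

# urlencode turns a dict into "a=1&b=2" form encoding
postdata = urllib.urlencode({'a': '1', 'b': '2'})
# Supplying a data argument makes urlopen send a POST instead of a GET
print(urllib2.urlopen('http://httpbin.org/post', postdata).read())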
1. Scraping the content of a Tieba (Baidu forum) thread.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import re

# Tieba (Baidu forum) spider
class tbSpider:
    def __init__(self, url):
        self.myurl = url
        self.datas = []
        self.mytool = htmlTool()
    # Start the spider
    def startSpider(self):
        title, sumpage = self.getInfo()
        self.saveData(title, sumpage)

    # Save the scraped posts to <title>.txt
    def saveData(self, title, sumpage):
        self.getData(sumpage)
        f = open(title + ".txt", 'w+')
        f.writelines(self.datas)
        f.close()
        print(u"Data saved.")
    # Fetch the thread title and total page count
    def getInfo(self):
        mypage = urllib2.urlopen(self.myurl).read().decode('utf-8')
        title = self.titleHunter(mypage)
        sumpage = self.pageCounter(mypage)
        print(u"Got the title and total page count.")
        return title, sumpage
    # Fetch every page and extract the post bodies
    def getData(self, sumpage):
        url = self.myurl + "&pn="
        for i in range(1, sumpage + 1):
            mypage = urllib2.urlopen(url + str(i)).read().decode('utf-8')
            msgs = re.findall(r'id="post_content.*?>(.*?)</div>', mypage, re.S)
            for msg in msgs:
                data = self.mytool.replace(msg.encode('utf-8'))
                self.datas.append(data + "\n")
    # Total number of pages in the thread
    def pageCounter(self, mypage):
        match = re.search(r'class="red">(\d+?)</span>', mypage, re.S)
        if match:
            sumpage = int(match.group(1))
        else:
            sumpage = 0
            print(u"No posts by the original poster found.")
        return sumpage
    # Thread title
    def titleHunter(self, mypage):
        match = re.search(r'<h1.*?>(.*?)</h1>', mypage, re.S)
        if match:
            title = match.group(1)
        else:
            title = "untitled"
            print(u"Title not found.")
        return title
# HTML tag / text clean-up helper
class htmlTool:
    # Strip <img> tags, runs of 1-7 spaces, and &nbsp; entities
    removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
    # Strip hyperlink tags, keeping the link text
    removeAddr = re.compile('<a.*?>|</a>')
    # Turn line-breaking tags into \n
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Turn table cells <td> into \t
    replaceTD = re.compile('<td>')
    # Turn single or double <br> into \n
    replaceBR = re.compile('<br><br>|<br>')
    # Strip any remaining tags
    removeExtraTag = re.compile('<.*?>')
    # Collapse runs of blank lines
    removeNoneLine = re.compile('\n+')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        x = re.sub(self.removeNoneLine, "\n", x)
        # strip() removes leading/trailing whitespace
        return x.strip()
# Prompt for the thread id; see_lz=1 limits the thread to the original poster's posts
bdurl = 'http://tieba.baidu.com/p/' + raw_input(u'http://tieba.baidu.com/p/') + "?see_lz=1"
# Create the spider and start scraping
myspider = tbSpider(bdurl)
myspider.startSpider()
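To see what htmlTool.replace produces on its own, here is a quick standalone check; the sample HTML is invented for illustration:
# Quick standalone check of htmlTool.replace (the sample HTML is made up)
tool = htmlTool()
sample = '<div>first<br><br><a href="#">link</a><img src="x.png"></div>'
print(tool.replace(sample))
# prints "first" and "link" on separate lines, with all tags stripped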
2. Logging in to 163 Mail with a spoofed User-Agent header and storing the cookie.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib
import urllib2
import cookielib

# Log in and persist the session cookie
class myLog:
    def __init__(self, url, username, password):
        self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
        self.url = url
        self.username = username
        self.password = password
        self.cookiefile = './logcookie.dat'
        # Route every request through a cookie-aware opener
        self.cookie = cookielib.LWPCookieJar()
        cookprocessor = urllib2.HTTPCookieProcessor(self.cookie)
        opener = urllib2.build_opener(cookprocessor)
        urllib2.install_opener(opener)
    def startLog(self):
        # Form fields expected by the login endpoint
        postdata = {
            'username': self.username,
            'password': self.password,
            'type': '1'
        }
        postdata = urllib.urlencode(postdata)
        request = urllib2.Request(self.url, postdata, self.header)
        response = urllib2.urlopen(request).read()
        # ignore_discard=True is required, or session cookies are not written out
        self.cookie.save(self.cookiefile, ignore_discard=True, ignore_expires=True)
        print(response)
# Test run (replace the placeholders with a real account)
url = 'http://reg.163.com/logins.jsp?type=1&product=mail163&url=http://entry.mail.163.com/coremail/fcg/ntesdoor2?lightweight%3D1%26verifycookie%3D1%26language%3D-1%26style%3D1'
mylog = myLog(url, '***@163.com', '***')
mylog.startLog()
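Once logcookie.dat exists, a later run can reuse the stored session instead of logging in again. A minimal sketch, assuming the cookie file written above; the mailbox URL is only illustrative:
# Reload the saved cookies on a later run (the target URL is illustrative)
import cookielib
import urllib2

cookie = cookielib.LWPCookieJar()
cookie.load('./logcookie.dat', ignore_discard=True, ignore_expires=True)
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
print(opener.open('http://entry.mail.163.com/').read())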