# _*_ coding:utf-8 _*
import urllib2
def load_page(url):
    '''
    Download the page at *url* and return its raw HTML as a byte string.

    Sends a desktop-browser User-Agent header so the server does not
    reject the request as coming from a script.

    Raises urllib2.URLError / urllib2.HTTPError on network failure.
    '''
    user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"
    headers = {"User-Agent": user_agent}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the underlying socket, even if read() raises.
        response.close()
def write_to_file(file_name,txt):
'''
将txt文本 存入到file_name 文件中
'''
print "正在写入文件" + file_name
#1.打开文件
f =open(file_name,'w')
#2.读写文件
f.write(txt)
#3.关闭文件
f.close()
def tieba_spider(url, begin_page, end_page, page_size=50):
    '''
    Crawl Tieba listing pages begin_page..end_page (inclusive) and save
    each one as "<page>.html" in the current directory.

    url        -- base URL; the numeric "pn" offset is appended to it,
                  so it should end with "...&pn=" or similar
    begin_page -- first page number (1-based)
    end_page   -- last page number, inclusive
    page_size  -- posts per page used to compute the offset (default 50,
                  Tieba's standard page size)
    '''
    for page in range(begin_page, end_page + 1):
        # Tieba paginates by item offset: page 1 -> 0, page 2 -> 50, ...
        offset = page_size * (page - 1)
        page_url = url + str(offset)
        html = load_page(page_url)
        write_to_file(str(page) + ".html", html)
#main
if __name__=="__main__":
url=raw_input("请输入贴吧地址")
print url
begin_page=int(raw_input("请输入起始地址"))
end_page=int(raw_input("请输入结束地址"))
tieba_spider(url,begin_page,end_page)
# python实现爬取贴吧网站网页 ("Python implementation for crawling Tieba pages")
# 最新推荐文章于 2019-12-20 12:47:14 发布
# NOTE(review): the two lines above are stray article header/footer text
# copied along with the code from a web page; commented out so the file parses.