#-*- coding:utf-8 -*-
import urllib
from urllib import parse, request
def writePage(html, filename):
    """Write the downloaded page content to a file on disk.

    html: raw page content as bytes (the file is opened in binary mode).
    filename: path of the file to create/overwrite.
    """
    print("keep file....")
    # html arrives as bytes, hence the binary "wb+" open mode below.
    print(type(html))
    with open(filename, "wb+") as f:
        f.write(html)
    print("-" * 30)
def loadPage(url, filename):
    """Fetch the content of *url* and return it as bytes.

    url: full URL to download (must include the scheme, e.g. "http://").
    filename: not used here; kept so the signature matches callers that
        pass the intended output file name alongside the URL.
    Returns: the raw response body as bytes.
    """
    print("loding.....")
    print(url)
    # Spoof a desktop-browser User-Agent so the server returns the normal
    # page instead of a bot/mobile variant.
    un_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
    # Build a Request object so the custom headers can be attached.
    # (In the original, this assignment was trapped inside a comment,
    # leaving `a_request` undefined at the return below.)
    a_request = urllib.request.Request(url, headers=un_headers)
    # Fetch the URL and return its body.
    return urllib.request.urlopen(a_request).read()
def tiebaSpider(url, beginPage, endPage):
    """Crawl a range of Tieba result pages and save each one to disk.

    url: base search URL, already carrying the encoded "kw=" query string.
    beginPage: first page number to fetch (1-based, inclusive).
    endPage: last page number to fetch (inclusive).
    """
    for page in range(beginPage, endPage + 1):
        # Tieba paginates via a "pn" offset of 50 posts per page.
        pn = (page - 1) * 50
        filename = "page" + str(page) + ".html"
        fullurl = url + "&pn=" + str(pn)
        print(" fullurl [%s]" % fullurl)
        html = loadPage(fullurl, filename)
        writePage(html, filename)
    # NOTE(review): the mangled source lost indentation here; this message is
    # placed after the loop as a one-time completion notice — confirm intent.
    print("thanks......")
if __name__ == '__main__':
    # Interactive entry point: ask for the forum name and page range,
    # build the search URL, then hand off to the spider.
    kw = input("请输入需要爬去的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页"))
    # The scheme prefix is required; without "http://" urllib raises
    # ValueError("unknown url type: %r" % self.full_url).
    url = "http://tieba.baidu.com/f?"
    # urlencode() takes a dict and percent-encodes it as a query string.
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    print(fullurl)
    tiebaSpider(fullurl, beginPage, endPage)