import urllib.request
import urllib.parse
import urllib.error
import logging
# Configure root logging: timestamped '%(asctime)s:%(levelname)s:%(message)s'
# records at DEBUG level, appended to a hard-coded Windows log-file path.
# NOTE(review): the absolute 'E:\...' path only works on the author's
# machine — consider making it configurable.
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
filename='E:\\北航大数据课程\\爬虫\\贴吧\\crawler.log',
level=logging.DEBUG)
def loadPage(url):
    """Download *url* and return the raw response body as bytes.

    Sends a mobile-Safari User-Agent so Tieba serves the lightweight
    mobile HTML.  HTTP and URL errors are logged with their details and
    then re-raised, so callers still see the original exception.

    :param url: fully-formed URL to fetch.
    :return: response body as ``bytes``.
    :raises urllib.error.HTTPError: on non-2xx responses (after logging).
    :raises urllib.error.URLError: on connection/DNS failures (after logging).
    """
    headers = {
        'Accept': 'text/html',
        'User-Agent': 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    }
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request) as response:
            return response.read()
    except urllib.error.HTTPError as e:
        # http.client.responses maps status codes to reason phrases; the
        # original (commented-out) handler referenced a non-existent
        # http.server.BaseHTTPRequestHandler.response attribute.
        import http.client
        logging.error('HTTPError code: %s and Messages: %s',
                      e.code, http.client.responses.get(e.code, 'Unknown'))
        logging.info('HTTPError headers: %s', e.headers)
        raise
    except urllib.error.URLError as e:
        logging.error('URLError reason: %s', e.reason)
        raise
def writePage(html, filename):
    """Write raw HTML bytes to *filename* and report the saved path.

    :param html: page body as ``bytes`` (e.g. the return of ``loadPage``).
    :param filename: destination path; opened in binary mode, so the
        bytes are written exactly as received.
    """
    # The 'with' block closes the file; the original also called
    # f.close() redundantly and then printed the closed file object —
    # print the destination path instead.
    with open(filename, 'wb') as f:
        f.write(html)
    print(filename)
def tiebaCrawler(url, beginpage, endpage, keyword,
                 save_dir='E:\\北航大数据课程\\爬虫\\贴吧\\'):
    """Fetch Tieba result pages for *keyword* and save each as HTML.

    :param url: base search URL ending in '?'
        (e.g. ``'http://tieba.baidu.com/f?'``).
    :param beginpage: first page number to fetch (1-based, inclusive).
    :param endpage: last page number to fetch (inclusive).
    :param keyword: forum name; URL-encoded into the ``kw`` query field.
    :param save_dir: directory prefix for the saved ``<page>.html`` files.
        New optional parameter — the default preserves the original
        hard-coded Windows path, so existing callers are unaffected.
    """
    # Append the URL-encoded keyword to the request path.
    query = urllib.parse.urlencode({'kw': keyword})
    base = url + query
    # Fetch every page from beginpage through endpage.  Tieba paginates
    # with a 'pn' offset of 50 posts per page.
    for page in range(beginpage, endpage + 1):
        pn = (page - 1) * 50
        fullurl = base + '&pn=' + str(pn)
        html = loadPage(fullurl)
        filename = save_dir + str(page) + '.html'
        writePage(html, filename)
if __name__ == "__main__":
    # Crawl the first 20 result pages for the BUAA forum.
    base_url = 'http://tieba.baidu.com/f?'
    tiebaCrawler(base_url, 1, 20, '北京航空航天大学')
# Source article: "爬虫之爬取百度贴吧html页面"
# (Crawler: fetching Baidu Tieba HTML pages), published 2021-02-04 06:30:07.