1. 自定义抓取类
#!/usr/bin/python3
# coding=utf-8
import gzip
import http.cookiejar
import re
import urllib.parse
import urllib.request
import zlib


class Spider:
    """Small cookie-aware page fetcher built on ``urllib.request``.

    Creating an instance builds an opener with an in-memory cookie jar and
    the class-level ``headers``, then installs it globally with
    ``urllib.request.install_opener``. ``login`` and ``getContents`` go
    through that installed opener, so cookies received by one request are
    available to the following ones.
    """

    # Body sent by ``login``. Callers normally assign URL-encoded bytes;
    # a str is also accepted and is encoded right before sending.
    postData = ''

    # Default headers attached to the installed opener.
    headers = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': (
            'Mozilla/5.0 (windows NT 6.3; WOW64) AppleWebkit/537.36 '
            '(KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
        ),
        'Accept-Encoding': 'gzip,deflate',
        'Host': 'domain.com'
    }

    def __init__(self):
        """Build a cookie-handling opener and install it as the global one."""
        cookie_jar = http.cookiejar.CookieJar()
        cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
        opener = urllib.request.build_opener(cookie_handler)
        # The attribute OpenerDirector reads is the lowercase ``addheaders``;
        # assigning any other spelling leaves the default headers in place.
        opener.addheaders = list(self.headers.items())
        urllib.request.install_opener(opener)

    def login(self, login_url):
        """Send ``postData`` to *login_url* and return the decoded response.

        Args:
            login_url: URL of the login endpoint.

        Returns:
            The response body as ``str``.
        """
        payload = self.postData
        # urllib rejects str request bodies with TypeError, so encode here.
        if isinstance(payload, str):
            payload = payload.encode()
        return self._fetch(urllib.request.Request(login_url, payload))

    def getContents(self, page_url):
        """Fetch *page_url* through the installed opener.

        Args:
            page_url: URL of the page to retrieve.

        Returns:
            The response body as ``str``.
        """
        return self._fetch(urllib.request.Request(page_url))

    def _fetch(self, req):
        """Open *req*, read the body, gunzip it if needed and decode it."""
        # The context manager closes the response even if reading fails.
        with urllib.request.urlopen(req) as resp:
            raw = resp.read()
        return self._ungzip(raw).decode()

    def _ungzip(self, data):
        """Return *data* gunzipped, or unchanged when it is not valid gzip.

        Args:
            data: Raw response bytes.

        Returns:
            The decompressed bytes, or *data* itself if decompression fails.
        """
        try:
            return gzip.decompress(data)
        except (OSError, EOFError, zlib.error):
            # Not gzip (or a damaged stream): hand the payload back as-is.
            return data
2. 抓取
# Example run: log in with form credentials, then fetch a page that
# requires the session cookies obtained during login.
sp = Spider()
# 'user' and 'pass' are the field names of the login form.
sp.postData = urllib.parse.urlencode({'user': 'username', 'pass': 'password'}).encode()
domain = 'http://abc.com'
login_url = domain + '/login'
login = sp.login(login_url)
page_url = domain + '/test.html'
contents = sp.getContents(page_url)
print(contents)