1. Processing data with regular expressions (re)
For the regular-expression rules, see: Python3 --- 正则表达式
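Before diving into the full crawler, it may help to see the core extraction in isolation. Below is a minimal sketch (the HTML string is invented for illustration) of the non-greedy, re.S-compiled pattern the spider uses:

import re

# Invented fragment standing in for a real page
html = '<div class="f18 mb20"><p>joke one</p></div><div class="f18 mb20"><p>joke two</p></div>'

# re.S lets "." match newlines as well; ".*?" is non-greedy, so each
# match stops at the first closing </div>
pattern = re.compile('<div.*?class="f18 mb20">(.*?)</div>', re.S)
print(pattern.findall(html))  # ['<p>joke one</p>', '<p>joke two</p>']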
from urllib import request
import re

class Spider:
    def __init__(self):
        # Start from the first page
        self.page = 1
        # Crawl switch: keep crawling while True
        self.switch = True

    def loadPage(self):
        """
        Download one page of jokes.
        """
        print("Downloading data...")
        url = "http://www.neihan8.com/article/list_5_" + str(self.page) + ".html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
        }
        req = request.Request(url, headers=headers)
        res = request.urlopen(req)
        # The site serves GBK-encoded pages
        html = res.read().decode('gbk')
        # Non-greedy match of each <div class="f18 mb20"> joke block;
        # re.S lets "." also match newlines
        pattern = re.compile('<div.*?class="f18 mb20">(.*?)</div>', re.S)
        content_list = pattern.findall(html)
        self.dealPage(content_list)

    def dealPage(self, content_list):
        """
        Clean up every joke on the page.
        """
        for item in content_list:
            # Strip the leftover HTML tags
            item = item.replace("<p>", "").replace("</p>", "").replace("<br>", "").replace("<br />", "")
            print("Writing data...")
            self.writePage(item)

    def writePage(self, item):
        """
        Append each joke to a file.
        """
        # Write with the same GBK encoding the site uses
        with open("duanzi.txt", "a", encoding="gbk") as f:
            f.write(item)

    def startWork(self):
        """
        Control the crawler's main loop.
        """
        while self.switch:
            command = input("Press Enter to keep crawling (type quit to exit): ")
            if command == "quit":
                self.switch = False
            else:
                self.loadPage()
                self.page += 1

if __name__ == "__main__":
    spider = Spider()
    # spider.loadPage()
    spider.startWork()
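One caveat about the decode('gbk') call above: if the page ever contains bytes that are not valid GBK, strict decoding raises UnicodeDecodeError. A defensive variant (my addition, not part of the original code) is:

html = res.read().decode('gbk', errors='ignore')  # skip undecodable bytes instead of crashing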
2. Processing data with XPath
For XPath usage, see: XPath --- 用法总结整理
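As a quick refresher, here is a self-contained sketch (the HTML is invented) of the two XPath query shapes the spider below relies on, extracting @href and @src attribute values:

from lxml import etree

# Made-up fragment mimicking a tieba post list and an embedded image
html = '''
<div class="threadlist_lz clearfix">
    <div><a href="/p/4884069807">post title</a></div>
</div>
<img class="BDE_Image" src="http://example.com/pic.jpg"/>
'''
selector = etree.HTML(html)
print(selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href'))  # ['/p/4884069807']
print(selector.xpath('//img[@class="BDE_Image"]/@src'))                      # ['http://example.com/pic.jpg']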
import os
from urllib import request, parse
from lxml import etree

class Spider:
    def __init__(self):
        self.tiebaName = input("Enter the tieba to visit: ")
        self.beginPage = int(input("Enter the start page: "))
        self.endPage = int(input("Enter the end page: "))
        self.url = 'http://tieba.baidu.com/f'
        self.ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        # Image counter, used as the file name
        self.userName = 1
        # Make sure the output directory exists
        os.makedirs("./images", exist_ok=True)

    def tiebaSpider(self):
        for page in range(self.beginPage, self.endPage + 1):
            pn = (page - 1) * 50  # page offset: 50 posts per page
            word = {'pn': pn, 'kw': self.tiebaName}
            # urlencode lives in urllib.parse, not urllib.request
            word = parse.urlencode(word)  # convert the dict to a URL-encoded query string
            myUrl = self.url + "?" + word
            # Example: http://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&pn=50
            # Call the page handler loadPage, which collects every post
            # link on the page
            self.loadPage(myUrl)

    # Read the page content
    def loadPage(self, url):
        req = request.Request(url, headers=self.ua_header)
        html = request.urlopen(req).read()
        # Parse the raw bytes into an HTML document
        selector = etree.HTML(html)
        # Grab the tail of every post URL on this page, i.e. the post id:
        # "p/4884069807" from http://tieba.baidu.com/p/4884069807
        links = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
        # links is a list of attribute-value strings
        # Join each one into a full post URL and hand it to loadImages
        for link in links:
            link = "http://tieba.baidu.com" + link
            self.loadImages(link)

    # Collect the images
    def loadImages(self, link):
        req = request.Request(link, headers=self.ua_header)
        html = request.urlopen(req).read()
        selector = etree.HTML(html)
        # Get the src of every image in the post
        imagesLinks = selector.xpath('//img[@class="BDE_Image"]/@src')
        # Download and save each image in turn
        for imagesLink in imagesLinks:
            self.writeImages(imagesLink)

    # Save the image content
    def writeImages(self, imagesLink):
        '''
        Write the binary image content to the file named after userName
        '''
        print(imagesLink)
        print("Saving file %d ..." % self.userName)
        # 1. Open the file, getting back a file object
        file = open('./images/' + str(self.userName) + '.png', 'wb')
        # 2. Fetch the image bytes
        images = request.urlopen(imagesLink).read()
        # 3. Call the file object's write() method to store the bytes
        file.write(images)
        # 4. Finally, close the file
        file.close()
        # Bump the counter
        self.userName += 1

# Simulated main function
if __name__ == "__main__":
    # First create the spider object
    mySpider = Spider()
    # Then call its method to start working
    mySpider.tiebaSpider()
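For reference, parse.urlencode turns a dict into a percent-encoded query string (non-ASCII values are UTF-8 encoded); a quick sanity check of the query built in tiebaSpider:

from urllib import parse

print(parse.urlencode({'kw': '美女', 'pn': 50}))  # kw=%E7%BE%8E%E5%A5%B3&pn=50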
3. Processing data with BeautifulSoup4
For BeautifulSoup4 usage, see: Python3 --- BeautifulSoup4用法总结
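The crawler below leans entirely on CSS selectors; here is a minimal sketch (with invented HTML shaped like one row of the recruitment table) of the select calls it uses:

from bs4 import BeautifulSoup

# Invented single-row table for illustration
html = '<table><tr class="even"><td><a href="position_detail.php?id=1">engineer</a></td><td>tech</td></tr></table>'
soup = BeautifulSoup(html, 'lxml')

row = soup.select('tr[class="even"]')[0]
print(row.select('td a')[0].get_text())       # engineer
print(row.select('td a')[0].attrs['href'])    # position_detail.php?id=1
print(row.select('td')[1].get_text())         # tech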
from bs4 import BeautifulSoup
from urllib import request
import json  # the results are stored as JSON

def tencent():
    url = 'http://hr.tencent.com/'
    req = request.Request(url + 'position.php?&start=10#a')
    response = request.urlopen(req)
    resHtml = response.read()
    output = open('tencent.json', 'w', encoding='utf-8')
    html = BeautifulSoup(resHtml, 'lxml')
    # Use CSS selectors to pick the alternating table rows
    result = html.select('tr[class="even"]')
    result2 = html.select('tr[class="odd"]')
    result += result2
    items = []
    for site in result:
        item = {}
        name = site.select('td a')[0].get_text()
        detailLink = site.select('td a')[0].attrs['href']
        catalog = site.select('td')[1].get_text()
        recruitNumber = site.select('td')[2].get_text()
        workLocation = site.select('td')[3].get_text()
        publishTime = site.select('td')[4].get_text()
        item['name'] = name
        item['detailLink'] = url + detailLink
        item['catalog'] = catalog
        item['recruitNumber'] = recruitNumber
        item['workLocation'] = workLocation
        item['publishTime'] = publishTime
        items.append(item)
    # Disable ASCII escaping so the JSON keeps its UTF-8 characters
    line = json.dumps(items, ensure_ascii=False)
    output.write(line)
    output.close()

if __name__ == "__main__":
    tencent()
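As a quick usage check (my addition, not part of the original post), the dumped file can be read back with json.load:

import json

with open('tencent.json', encoding='utf-8') as f:
    items = json.load(f)
print(len(items), items[0]['name'] if items else 'no rows')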