利用类创建爬虫的好处
- 代码逻辑清晰,方便维护
- 功能可扩展,耦合性较小
- 方便重复调用
- …
代码实例如下。如有不足之处,敬请指正:
#coding=utf-8
# 这是一个利用正则爬取链家网的二手房位置和价格信息的简单爬虫
# 可以加入代理
import pymongo
import requests
import re
# 可扩展csv,这里就不写了
import csv
import pymysql
class LianjiaSpider():
    """Regex-based scraper for second-hand house listings on Lianjia (Guangzhou).

    Fetches listing pages, extracts (name, total price) pairs, and persists
    them to both MySQL and MongoDB.
    """

    def __init__(self):
        # Captures (region name, total price in 万元) from one listing entry.
        self.pattern = r'<div\sclass="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>'
        # Lianjia paginates as /ershoufang/pgN/ — the original 'pg=' produced
        # an invalid 'pg=1/' path. TODO confirm against the live site.
        self.base_url = 'https://gz.lianjia.com/ershoufang/pg'
        self.headers = {
            'User-Agent': 'Mozilla/5.0'
        }
        # Placeholder proxy config; fill in a real proxy if needed.
        self.proxies = {
            'http': "http://"
        }
        self.page = 1
        # Parsed (name, price) tuples of the most recently fetched page.
        # Initialized here so saveToMysql/saveToMongo never hit AttributeError.
        self.ls = []
        # MySQL connection and cursor. pymysql >= 1.0 removed positional
        # connect() arguments, so keyword arguments are required.
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', database='lianjia',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        # MongoDB client, database, and target collection.
        self.conn = pymongo.MongoClient('localhost', 27017)
        self.db2 = self.conn['lianjia']
        self.myset = self.db2['house']

    def getPage(self, url):
        """Fetch one listing page and hand it to the parser.

        Raises requests.HTTPError on a non-2xx response instead of silently
        parsing an error page.
        """
        res = requests.get(url, headers=self.headers)
        res.raise_for_status()
        res.encoding = 'utf-8'
        self.html = res.text
        self.parsePage()

    def parsePage(self):
        """Extract (name, price) tuples from self.html into self.ls."""
        # re.S so '.*?' spans the newlines between the two captured fields.
        regex = re.compile(self.pattern, re.S)
        self.ls = regex.findall(self.html)
        print(self.ls)

    def saveToMysql(self):
        """Insert the parsed rows into MySQL.

        Price is converted from 万元 (the string scraped from the page)
        to an integer number of yuan.
        """
        # Parameterized query — never interpolate scraped text into SQL.
        ins = "insert into house(name,price) values (%s,%s)"
        for name, price in self.ls:
            row = (name.strip(), int(float(price.strip()) * 10000))
            print(row)
            self.cursor.execute(ins, row)
        # One commit per page instead of one per row.
        self.db.commit()

    def saveToMongo(self):
        """Insert the parsed rows into the MongoDB 'house' collection."""
        for name, price in self.ls:
            dc = {
                'name': name.strip(),
                'price': float(price.strip()) * 10000,
            }
            # Collection.insert() was removed in modern pymongo.
            self.myset.insert_one(dc)

    def main(self):
        """Interactive entry point: fetch/parse/store one page per 'y'."""
        while 1:
            c = input('y/n:')
            if c.lower() == "y":
                url = self.base_url + str(self.page) + "/"
                print(url)
                # Fetch, parse, and persist the current page.
                self.getPage(url)
                self.saveToMysql()
                self.saveToMongo()
                self.page += 1
            else:
                # Release database resources before exiting.
                self.cursor.close()
                self.db.close()
                self.conn.close()
                print('欢迎再次使用')
                break
if __name__ == "__main__":
    # Script entry point: build the spider and start the interactive loop.
    LianjiaSpider().main()