前言:这次的爬取内容是安居客网页里面的信息,首先是我爬取的页面是属于需要动态加载,获取到的数据是加载获得到的数据,这次做的主要是将数据存储到数据库的操作,使用跳转页面获取具体的详细数据
一、增量爬虫是什么?
增量爬虫是将已爬取的数据保存下来,之后每次只抓取新增或更新的数据,已有的数据便保持不变,适用于(天气预报、招聘岗位等每天都有更新的数据....)
注意:使用增量数据保存到数据库的时候需要一个指标,和一个数据保存的表,需要两个数据表信息,以第一个指标来判断是否已有数据,是否进行保存
二、python数据存储到数据库
创建数据库连接对象-----》
Db=pymysql.connect(host,user,password,database,port)
创建游标对象------》
Cursor=db.cursor()
执行sql-------》
Cursor.execute(sql语句,[xx,xx])
提交给数据库---------》
Db.commit()
关闭游标和断开数据库——————》
Cursor.close()
Db.close()
三、多级页面的跳转获取
在安居客的案例来说,一级页面就是所有房屋信息的一页,但是需要的数据不完整,在一级页面的开发者工具里面找到可以跳转到二级页面的链接,解析数据。获取可以连接到二级页面的链接之后,再通过解析到的链接重新用requests发送请求,最后在二级页面解析需要的数据。
四:遇到的问题以及解决
1:出现爬取的数据是空的没有内容
解决方法:
登录https://m.anjuke.com/nb/sale/yinzhou/会发现因页面访问次数过多,需要图片验证,手动验证便可以解决
2:写入数据库的时候出现(1241, 'Operand should contain 1 column(s)')
解决:那是因为在楼层字段中,某些地区的楼层信息解析出多个值,导致数据库字段数和python解析结果数量不一致。因此在代码上做出了如下改变
item['楼层:']=two_pa.xpath("//*[@id='__layout']/div/div/div/div[4]/ul/li[2]/div/span[2]/text()")[0]
五:代码
首先自定义函数,连接使用数据库
def __init__(self):
    """Open the MySQL connection and prepare the base URL and request headers.

    Fix: the original kept a commented-out 'cookie' header whose very long
    value wrapped onto a second, *uncommented* line — a syntax error when the
    snippet is pasted. The cookie is optional (only needed once the site
    starts demanding captcha verification), so it is dropped here.
    """
    # NOTE(review): hard-coded DB credentials — move them to config/env.
    self.db=pymysql.connect(host='localhost',user='root',password='jiuzheyang88',database='cardb',port=3306)
    self.cursor=self.db.cursor()
    # Level-1 URL template: {city}/{district}.
    self.get_url='https://m.anjuke.com/{}/sale/{}/'
    self.headers={
        # A 'cookie' entry can be added here after logging in via a browser.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
主逻辑函数
def one_get_html(self):
    """Prompt for city and district codes, fetch the level-1 listing page,
    then hand the HTML to the level-1 parser."""
    city=input('请输入城市首字母比如(宁波)写成(nb):')
    city=urllib.parse.quote(city)
    pri=input('请输入地区或者街道用英文比如(鄞州)写成(yinzhou):')
    pri=urllib.parse.quote(pri)# percent-encode the input so it is safe to embed in the URL
    URL=self.get_url.format(city,pri)
    html=requests.get(url=URL,headers=self.headers).text# request the level-1 (listing) page
    self.one_params_html(html) # parse level 1 and continue to the level-2 pages
解析一级页面,增量判断的逻辑写在这个模块里。所谓的指标是一级页面中的链接,如果指标重复那么程序直接退出;如果不存在,则从一级页面进入二级页面。如果无需保存到数据库,可以直接删除与指标或数据库有关的代码(此数据库操作主要用来判断指标是否存在,从而决定是否执行二级页面的解析以及保存)
def one_params_html(self,html):
    """Parse the level-1 page. A segment of each listing link acts as the
    incremental fingerprint: unseen links are recorded and followed to the
    level-2 page; the first already-seen link stops the whole run."""
    par_html=etree.HTML(html)
    r_list=par_html.xpath(".//li[@class='item-wrap']/a/@href")
    # The two lines above load the level-1 HTML and extract the detail-page links.
    for r in r_list:
        end=r.split('/')[5]# the full link is long; keep only a distinctive segment as the fingerprint
        print(end)
        md5_url=self.md_5(end)# digest of the fingerprint (see the md_5 helper)
        sql="select * from house_figer where figer like(%s) "
        # Query the fingerprint table to decide whether this listing is new.
        self.cursor.execute(sql,[end])# run the lookup
        resulu=self.cursor.fetchall() # fetch every matching row
        if not resulu: # the fingerprint has not been seen yet
            ins='insert into house_figer values(%s)' # record it in the fingerprint table
            self.cursor.execute(ins,[end]) # execute
            self.db.commit() # commit
            two_html=requests.get(url=r,headers=self.headers).text
            # fetch the level-2 (detail) page
            self.two_params_html(two_html)
            # parse the level-2 page
        else:
            sys.exit('ok!')
            # fingerprint already stored: nothing new remains, stop the crawl
二级页面的解析以及保存到数据库
def two_params_html(self,two_html):
    """Parse a level-2 detail page and insert the extracted fields into the
    `house` table.

    NOTE(review): xpath() returns a list; most fields are inserted as 1-element
    lists, which pymysql happens to escape to a scalar — fragile. Only 楼层
    takes [0], and it will raise IndexError when the node is missing.
    """
    item={}
    two_pa=etree.HTML(two_html)
    item['名称']=two_pa.xpath(".//div[@class='baseinfo-title']/h1/text()")
    item['总价格']=two_pa.xpath("//*[@id='__layout']/div/div/div/div[3]/div[2]/div[1]/span[1]/text()")
    item['面积']=two_pa.xpath("//*[@id='__layout']/div/div/div/div[3]/div[2]/div[3]/span[1]/text()")
    item['单价:']=two_pa.xpath("//*[@id='__layout']/div/div/div/div[4]/ul/li[1]/div/span[2]/text()")
    item['楼层:']=two_pa.xpath("//*[@id='__layout']/div/div/div/div[4]/ul/li[2]/div/span[2]/text()")[0]# take [0]: this field can match several nodes, which breaks the INSERT (MySQL error 1241)
    item['预算:']=two_pa.xpath("//*[@id='__layout']/div/div/div/div[4]/ul/li[5]/a/span[2]/text()")
    inss='insert into house values(%s,%s,%s,%s,%s,%s)'
    # insert one row into the data table
    self.cursor.execute(inss,[item['名称'],item['总价格'],item['面积'],item['单价:'],item['楼层:'],item['预算:']])
    # execute the insert
    self.db.commit()
    # commit
    print(item,len(item))
加密
def md_5(self,url):
    """Return the hex MD5 digest of *url* (used as the incremental fingerprint).

    Bug fix: the original ended with ``return s.hexdigest`` (no parentheses),
    which returns the bound method object instead of the 32-character hex
    string — the call was missing.
    """
    s=md5()
    s.update(url.encode())# hash the UTF-8 bytes of the URL segment
    return s.hexdigest()
运行主要逻辑函数,并且关闭数据库游标和数据库
def run(self):
    """Entry point: run the crawl, then close the cursor and the connection.

    NOTE(review): sys.exit() inside one_params_html raises SystemExit, in
    which case the two close() calls below are skipped — consider try/finally.
    """
    self.one_get_html()
    self.cursor.close()
    self.db.close()
完整的代码
'''安居客信息抓取'''
import requests,time,re,random
import urllib.parse
from lxml import etree
import pymysql
from hashlib import md5
import sys
class DaZhong():
    """Incremental scraper for Anjuke (mobile) second-hand-house listings.

    A distinctive segment of each listing URL serves as the incremental
    fingerprint and is stored in the `house_figer` table; detail-page fields
    go into the `house` table. The crawl stops at the first already-seen
    fingerprint.

    Fixes vs. the original:
      * the commented-out 'cookie' header wrapped onto an uncommented second
        line — a syntax error — so it was removed;
      * ``md_5`` returned the bound method ``hexdigest`` instead of calling it;
      * single text values are now extracted from the XPath result lists so
        the INSERT never receives a 0- or 2+-element list (MySQL error 1241);
      * malformed listing links no longer raise IndexError;
      * the cursor/connection are closed even when ``sys.exit()`` fires.
    """
    def __init__(self):
        # NOTE(review): hard-coded DB credentials — move them to config/env.
        self.db=pymysql.connect(host='localhost',user='root',password='jiuzheyang88',database='cardb',port=3306)
        self.cursor=self.db.cursor()
        # Level-1 URL template: {city}/{district}.
        self.get_url='https://m.anjuke.com/{}/sale/{}/'
        # A 'cookie' header may be required once the site demands captcha
        # verification; log in via a browser and add it here.
        self.headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
        }
    def one_get_html(self):
        """Prompt for city/district codes and fetch the level-1 listing page."""
        city=input('请输入城市首字母比如(宁波)写成(nb):')
        city=urllib.parse.quote(city)
        pri=input('请输入地区或者街道用英文比如(鄞州)写成(yinzhou):')
        pri=urllib.parse.quote(pri)  # percent-encode so the URL stays valid
        URL=self.get_url.format(city,pri)
        html=requests.get(url=URL,headers=self.headers).text
        self.one_params_html(html)
    def one_params_html(self,html):
        """Parse level-1 links; follow unseen listings, stop at the first seen one."""
        par_html=etree.HTML(html)
        r_list=par_html.xpath(".//li[@class='item-wrap']/a/@href")
        for r in r_list:
            parts=r.split('/')
            if len(parts)<=5:
                # Unexpected link shape — skip it rather than crash (was IndexError).
                continue
            end=parts[5]  # distinctive URL segment used as the fingerprint
            print(end)
            md5_url=self.md_5(end)  # digest of the fingerprint (the raw segment is what gets stored)
            sql="select * from house_figer where figer like(%s) "
            self.cursor.execute(sql,[end])
            resulu=self.cursor.fetchall()
            if not resulu:
                # New listing: record the fingerprint, then scrape the detail page.
                ins='insert into house_figer values(%s)'
                self.cursor.execute(ins,[end])
                self.db.commit()
                two_html=requests.get(url=r,headers=self.headers).text
                self.two_params_html(two_html)
            else:
                # First already-seen fingerprint: everything newer is done.
                sys.exit('ok!')
    def md_5(self,url):
        """Return the hex MD5 digest of *url*.

        Bug fix: the original returned the bound method ``hexdigest`` instead
        of calling it, so callers got a method object, not a hex string.
        """
        s=md5()
        s.update(url.encode())
        return s.hexdigest()
    def two_params_html(self,two_html):
        """Parse a level-2 detail page and insert one row into `house`."""
        def _first(nodes):
            # xpath() returns a list; keep the first text node ('' if absent)
            # so the INSERT always receives scalars. Feeding a list with 0 or
            # 2+ elements triggers MySQL error 1241 ("Operand should contain
            # 1 column(s)").
            return nodes[0] if nodes else ''
        item={}
        two_pa=etree.HTML(two_html)
        item['名称']=_first(two_pa.xpath(".//div[@class='baseinfo-title']/h1/text()"))
        item['总价格']=_first(two_pa.xpath("//*[@id='__layout']/div/div/div/div[3]/div[2]/div[1]/span[1]/text()"))
        item['面积']=_first(two_pa.xpath("//*[@id='__layout']/div/div/div/div[3]/div[2]/div[3]/span[1]/text()"))
        item['单价:']=_first(two_pa.xpath("//*[@id='__layout']/div/div/div/div[4]/ul/li[1]/div/span[2]/text()"))
        item['楼层:']=_first(two_pa.xpath("//*[@id='__layout']/div/div/div/div[4]/ul/li[2]/div/span[2]/text()"))
        item['预算:']=_first(two_pa.xpath("//*[@id='__layout']/div/div/div/div[4]/ul/li[5]/a/span[2]/text()"))
        inss='insert into house values(%s,%s,%s,%s,%s,%s)'
        self.cursor.execute(inss,[item['名称'],item['总价格'],item['面积'],item['单价:'],item['楼层:'],item['预算:']])
        self.db.commit()
        print(item,len(item))
    def run(self):
        """Entry point: crawl, then always release the DB resources."""
        try:
            self.one_get_html()
        finally:
            # sys.exit() raises SystemExit; this finally still closes the
            # cursor and connection (the original leaked them in that case).
            self.cursor.close()
            self.db.close()
if __name__=='__main__':
    # Build the scraper and kick off the incremental crawl.
    DaZhong().run()
总结
数据表信息
指标表信息
显示数据指标已经存在
表达的不知道是否清楚,要是有什么疑问可以留言,如果有说错表达不对的地方也可以留言帮我指正