最近在优化爬虫的过程中,出现了 IndexError: list index out of range。
看图,这是什么原因呢?
先print下you数组的长度。
结果不出所料,当列表为空或者索引超出列表长度时就会出现这种问题。我这里给出两种解决方案,第一种是直接将有空白的地方删除。
接下来就要用到比bs4更方便的pyquery,这是一个类似于jquery库的python库。
from pyquery import PyQuery as pq
经过检查是因为有些房产信息没给建造年限,然后修改代码。
# Extract the per-listing details from the fetched page.
# Relies on names set up earlier in the script: `html` (the HTTP response),
# `pq` (PyQuery) and `re`.
doc = pq(html.text)
# Spans inside the odd-numbered .details-item children.
jli = doc('.details-item:nth-child(2n-1) span')
# Spans inside the even-numbered .details-item children; only the first three
# spans of each are kept, so an optional fourth span cannot shift the columns.
oli = doc('.details-item:nth-child(2n) span:lt(3)')
# Split the combined text of each selection on single spaces.
myroom = jli.text().split(' ')
mylocal = oli.text().split(' ')
print(mylocal)
# Strip any whitespace that survived the split (newlines, tabs, ...).
you = [re.sub(r'\s', "", token) for token in myroom]
print(you)
# Group the flat token lists into one entry per listing. The group count is
# derived from what was actually parsed rather than assuming 60 listings per
# page, which raised IndexError on short or empty pages; an incomplete
# trailing group is dropped.
myroomlocal = [[you[k * 2], you[k * 2 + 1]] for k in range(len(you) // 2)]
mylocaldetail = [
    [mylocal[m * 3], mylocal[m * 3 + 1], mylocal[m * 3 + 2]]
    for m in range(len(mylocal) // 3)
]
完整的优化代码,这里将cityname作为数据库和搜索城市变量。
import requests
import re
import pymysql
from pyquery import PyQuery as pq
# Crawl up to MAX_PAGES result pages of <city>.anjuke.com/sale/ and store one
# row per listing in the MySQL table "<city>_admin" of the "anjuke" database.

MAX_PAGES = 10
# id, image, title, address part 1, total price, unit price,
# three detail fields, address part 2.
COLUMN_COUNT = 10
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
)
# The city name is spliced into a table name, which cannot be sent as a query
# parameter, so it is restricted to characters that are safe unquoted.
CITYNAME_RE = re.compile(r'[A-Za-z0-9_]+')


def pages_to_crawl(cityname, last_page=MAX_PAGES):
    """Return the page numbers to crawl: 1..last_page, or none for an empty city.

    The original condition ``page < 11 & len(cityname) != 0`` was parsed as
    ``page < (11 & len(cityname)) != 0`` because ``&`` binds tighter than the
    comparisons, so the number of pages depended on the name's length.
    """
    return range(1, last_page + 1) if cityname else range(0)


def build_request(cityname, page):
    """Return ``(url, headers)`` for the given 1-based result page."""
    if page == 1:
        url = 'https://%s.anjuke.com/sale/#' % cityname
        referer = 'https://%s.anjuke.com/sale/#' % cityname
    else:
        url = 'https://%s.anjuke.com/sale/p%d/#filtersort' % (cityname, page)
        referer = 'https://%s.anjuke.com/sale/p%d/' % (cityname, page)
    return url, {'referer': referer, 'user-agent': USER_AGENT}


def split_tokens(text):
    """Split *text* on single spaces and strip remaining whitespace per token."""
    return [re.sub(r'\s', "", token) for token in text.split(' ')]


def group(items, size):
    """Split *items* into consecutive lists of *size*, dropping a short tail.

    Replaces the hard-coded ``while k < 60`` loops, which raised IndexError
    whenever a page yielded fewer than 60 listings.
    """
    return [items[start:start + size]
            for start in range(0, len(items) - size + 1, size)]


def build_rows(page, jpg, tail, myroomlocal, mylocaldetail, mytotal, simple):
    """Combine the parallel per-listing lists into tuples ready for insertion.

    ``zip`` stops at the shortest list, so a listing that is missing from any
    of the inputs can no longer trigger an IndexError.
    Each row is ``(id, image, title, address1, total, unit_price, detail1,
    detail2, detail3, address2)`` with ``id = page * 1000 + index``.
    """
    listings = zip(jpg, tail, myroomlocal, mylocaldetail, mytotal, simple)
    rows = []
    for index, (jpgs, scripts, address, details, total, oneprice) in enumerate(listings):
        local, localtwo = address
        localroom, localarea, localhigh = details
        rows.append((page * 1000 + index, jpgs, scripts, local, total,
                     oneprice, localroom, localarea, localhigh, localtwo))
    return rows


def insert_sql(cityname, column_count=COLUMN_COUNT):
    """Return the parameterized INSERT statement for ``<cityname>_admin``.

    Raises:
        ValueError: if *cityname* contains characters outside ``[A-Za-z0-9_]``.
    """
    if not CITYNAME_RE.fullmatch(cityname):
        raise ValueError('unsafe city name for a table identifier: %r' % (cityname,))
    placeholders = ', '.join(['%s'] * column_count)
    return 'insert into %s_admin values (%s)' % (cityname, placeholders)


def save_rows(sql, rows):
    """Insert *rows* with *sql* in one transaction; the connection is always closed."""
    db = pymysql.connect(host="localhost", user="root", password="", database="anjuke")
    try:
        with db.cursor() as cursor:
            # Values travel as query parameters, so quotes in scraped text
            # can no longer break (or inject into) the statement.
            cursor.executemany(sql, rows)
        db.commit()
    finally:
        db.close()


def crawl_page(cityname, page, sql):
    """Fetch one result page, parse its listings and store them."""
    print("这是第" + str(page) + "页")
    url, headers = build_request(cityname, page)
    html = requests.get(url, headers=headers)
    # Image URLs.
    jpg = re.findall(r'<img src="(.*?)" width="180" height="135" />', html.text)
    # Listing titles.
    tail = re.findall(r'<a data-from="" data-company="" title="(.*?)" href', html.text)
    # Detail fields: odd .details-item children give two tokens per listing,
    # even ones give three (only the first three spans are selected).
    doc = pq(html.text)
    you = split_tokens(doc('.details-item:nth-child(2n-1) span').text())
    mylocal = doc('.details-item:nth-child(2n) span:lt(3)').text().split(' ')
    print(mylocal)
    print(you)
    myroomlocal = group(you, 2)
    mylocaldetail = group(mylocal, 3)
    print(mylocaldetail)
    # Total price and unit price.
    mytotal = re.findall(r'<span class="price-det"><strong>(.*?)</strong>', html.text)
    simple = re.findall(r'<span class="unit-price">(.*?)</span> ', html.text)
    print(len(jpg))
    rows = build_rows(page, jpg, tail, myroomlocal, mylocaldetail, mytotal, simple)
    if len(rows) != len(tail):
        # The original indexed every list by len(tail) and crashed here.
        print("警告:第%d页解析到的字段数量不一致,仅保存%d条" % (page, len(rows)))
    save_rows(sql, rows)


def main():
    """Prompt for a city and crawl its result pages."""
    print('请输入你要获取的城市')
    cityname = input()
    pages = pages_to_crawl(cityname)
    if pages:
        # Built (and validated) once, before any request is made.
        sql = insert_sql(cityname)
        for page in pages:
            crawl_page(cityname, page, sql)


if __name__ == "__main__":
    main()
第二种优化办法,是直接锁定建造时间信息,当其值为空时注入‘建造时间未知’。
# Extract the per-listing details from the fetched page.
# Relies on names set up earlier in the script: `html` (the HTTP response),
# `pq` (PyQuery) and `re`.
doc = pq(html.text)
jli = doc('.details-item:nth-child(2n-1) span')
# One element per even-numbered .details-item child; its text is split on '|'
# into that listing's fields.
oli = doc('.details-item:nth-child(2n)').items()
myroom = jli.text().split(' ')
mylocal = [li.text().split('|') for li in oli]
print(mylocal)
print(len(mylocal))
# Put a placeholder in slot 3 (construction year) when it is empty. The length
# check also covers listings whose split produced fewer than four fields,
# which previously raised IndexError on the [3] lookup.
for fields in mylocal:
    if len(fields) < 4 or len(fields[3]) == 0:
        fields.insert(3, '建造时间未知')
# Strip any whitespace that survived the split (newlines, tabs, ...).
you = [re.sub(r'\s', "", token) for token in myroom]
print(you)
# Pair up the tokens, one pair per listing. The pair count comes from the
# parsed data instead of a fixed 60, so short or empty pages no longer raise
# IndexError; an unpaired trailing token is dropped.
myroomlocal = [[you[k * 2], you[k * 2 + 1]] for k in range(len(you) // 2)]
最终优化完整代码。
import requests
import re
import pymysql
from pyquery import PyQuery as pq
# Crawl up to MAX_PAGES result pages of <city>.anjuke.com/sale/ and store one
# row per listing (including the construction year) in the MySQL table
# "<city>_admin" of the "anjuke" database.

MAX_PAGES = 10
# id, image, title, address part 1, total price, unit price,
# four detail fields, address part 2.
COLUMN_COUNT = 11
YEAR_UNKNOWN = '建造时间未知'
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
)
# The city name is spliced into a table name, which cannot be sent as a query
# parameter, so it is restricted to characters that are safe unquoted.
CITYNAME_RE = re.compile(r'[A-Za-z0-9_]+')


def pages_to_crawl(cityname, last_page=MAX_PAGES):
    """Return the page numbers to crawl: 1..last_page, or none for an empty city.

    The original condition ``page < 11 & len(cityname) != 0`` was parsed as
    ``page < (11 & len(cityname)) != 0`` because ``&`` binds tighter than the
    comparisons, so the number of pages depended on the name's length.
    """
    return range(1, last_page + 1) if cityname else range(0)


def build_request(cityname, page):
    """Return ``(url, headers)`` for the given 1-based result page."""
    if page == 1:
        url = 'https://%s.anjuke.com/sale/' % cityname
    else:
        url = 'https://%s.anjuke.com/sale/p%d/#filtersort' % (cityname, page)
    # The referer is the page's own URL.
    return url, {'referer': url, 'user-agent': USER_AGENT}


def split_tokens(text):
    """Split *text* on single spaces and strip remaining whitespace per token."""
    return [re.sub(r'\s', "", token) for token in text.split(' ')]


def group(items, size):
    """Split *items* into consecutive lists of *size*, dropping a short tail.

    Replaces the hard-coded ``while k < 60`` loop, which raised IndexError
    whenever a page yielded fewer than 60 listings.
    """
    return [items[start:start + size]
            for start in range(0, len(items) - size + 1, size)]


def fill_missing_year(fields):
    """Insert YEAR_UNKNOWN at index 3 when that slot is empty or absent.

    Modifies *fields* in place and returns it. The length check prevents the
    IndexError the bare ``fields[3]`` lookup raised on short lists.
    """
    if len(fields) < 4 or len(fields[3]) == 0:
        fields.insert(3, YEAR_UNKNOWN)
    return fields


def build_rows(page, jpg, tail, myroomlocal, mylocal, mytotal, simple):
    """Combine the parallel per-listing lists into tuples ready for insertion.

    ``zip`` stops at the shortest list, and listings whose detail list still
    has fewer than four fields are skipped, so malformed entries can no
    longer trigger an IndexError.
    Each row is ``(id, image, title, address1, total, unit_price, detail1,
    detail2, year, detail3, address2)`` with ``id = page * 1000 + index``.
    """
    listings = zip(jpg, tail, myroomlocal, mylocal, mytotal, simple)
    rows = []
    for index, (jpgs, scripts, address, details, total, oneprice) in enumerate(listings):
        if len(details) < 4:
            continue
        local, localtwo = address
        localroom, localarea, localhigh, localtimes = details[:4]
        rows.append((page * 1000 + index, jpgs, scripts, local, total, oneprice,
                     localroom, localarea, localtimes, localhigh, localtwo))
    return rows


def insert_sql(cityname, column_count=COLUMN_COUNT):
    """Return the parameterized INSERT statement for ``<cityname>_admin``.

    Raises:
        ValueError: if *cityname* contains characters outside ``[A-Za-z0-9_]``.
    """
    if not CITYNAME_RE.fullmatch(cityname):
        raise ValueError('unsafe city name for a table identifier: %r' % (cityname,))
    placeholders = ', '.join(['%s'] * column_count)
    return 'insert into %s_admin values (%s)' % (cityname, placeholders)


def save_rows(sql, rows):
    """Insert *rows* with *sql* in one transaction; the connection is always closed."""
    db = pymysql.connect(host="localhost", user="root", password="", database="anjuke")
    try:
        with db.cursor() as cursor:
            # Values travel as query parameters, so quotes in scraped text
            # can no longer break (or inject into) the statement.
            cursor.executemany(sql, rows)
        db.commit()
    finally:
        db.close()


def crawl_page(cityname, page, sql):
    """Fetch one result page, parse its listings and store them."""
    print("这是第" + str(page) + "页")
    url, headers = build_request(cityname, page)
    html = requests.get(url, headers=headers)
    # Image URLs.
    jpg = re.findall(r'<img src="(.*?)" width="180" height="135" />', html.text)
    # Listing titles.
    tail = re.findall(r'<a data-from="" data-company="" title="(.*?)" href', html.text)
    # Detail fields: odd .details-item children give two tokens per listing;
    # each even one is split on '|' into that listing's fields.
    doc = pq(html.text)
    you_text = doc('.details-item:nth-child(2n-1) span').text()
    mylocal = [li.text().split('|')
               for li in doc('.details-item:nth-child(2n)').items()]
    print(mylocal)
    print(len(mylocal))
    for fields in mylocal:
        fill_missing_year(fields)
    you = split_tokens(you_text)
    print(you)
    myroomlocal = group(you, 2)
    # Total price and unit price.
    mytotal = re.findall(r'<span class="price-det"><strong>(.*?)</strong>', html.text)
    simple = re.findall(r'<span class="unit-price">(.*?)</span> ', html.text)
    print(len(jpg))
    rows = build_rows(page, jpg, tail, myroomlocal, mylocal, mytotal, simple)
    if len(rows) != len(tail):
        # The original indexed every list by len(tail) and crashed here.
        print("警告:第%d页解析到的字段数量不一致,仅保存%d条" % (page, len(rows)))
    save_rows(sql, rows)


def main():
    """Prompt for a city and crawl its result pages."""
    print('请输入你要获取的城市')
    cityname = input()
    pages = pages_to_crawl(cityname)
    if pages:
        # Built (and validated) once, before any request is made.
        sql = insert_sql(cityname)
        for page in pages:
            crawl_page(cityname, page, sql)


if __name__ == "__main__":
    main()
如有错误,还望大家指正。