Python 爬虫优化,以及优化过程中 IndexError: list index out of range 的处理方法

最近在优化爬虫的过程中出现了 IndexError: list index out of range。
看图,这是什么原因呢?
在这里插入图片描述
先print下you数组的长度。
在这里插入图片描述
结果不出所料:当列表为空,或者访问的下标超出列表长度时,就会出现这个问题。我这里给出两种解决方案,第一种是直接把可能为空的字段去掉,不再抓取。
接下来要用到比 bs4 更方便的 pyquery,它是一个用法类似 jQuery 的 Python 库。

from pyquery import PyQuery as pq

经过检查,原因是有些房源信息没有给出建造年份,于是按下面的方式修改代码。

#获取房子信息
 doc=pq(html.text)
 #取奇数元素
 jli=doc('.details-item:nth-child(2n-1) span')
 #取偶数元素
 oli=doc('.details-item:nth-child(2n) span:lt(3)')
 #直接分割
 myroom=jli.text().split(' ')
 mylocal=oli.text().split(' ')
 print(mylocal)
 you=[]
 i=0
 while  i <len(myroom):
    you.append(re.sub(r'\s', "",myroom[i]))
    i=i+1
 print(you)
 mylocaldetail=[]
 myroomlocal=[]
 m=0
 k=0
 while k <60:
    myroomlocal.append([you[k*2],you[k*2+1]])
    k=k+1
 while m <60:
    mylocaldetail.append([mylocal[m*3],mylocal[m*3+1],mylocal[m*3+2]])
    m=m+1

完整的优化代码如下,这里用 cityname 同时作为要搜索的城市和数据表名的前缀(表名为 cityname_admin)。

import requests
import re
import pymysql
from pyquery import PyQuery as pq
# Scrape the first ten result pages of second-hand listings for one city on
# anjuke.com and insert one row per listing into the table `<cityname>_admin`
# of the local MySQL database `anjuke`.
LAST_PAGE = 10
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

print('请输入你要获取的城市')
cityname = input()

# The city name ends up in the host name and in the table name. Table names
# cannot be passed as bound SQL parameters, so only accept characters that
# are harmless inside an identifier.
if cityname and not re.fullmatch(r'[A-Za-z0-9_]+', cityname):
    raise SystemExit('城市名只能包含字母、数字和下划线')

page = 1
# `and`, not `&`: the bitwise operator binds tighter than the comparisons, so
# the old test was evaluated as `page < (11 & len(cityname)) != 0` and stopped
# early for most city names. An empty city name still skips the loop.
while page <= LAST_PAGE and cityname:
    print("这是第" + str(page) + "页")
    if page == 1:
        url = 'https://%s.anjuke.com/sale/#' % cityname
        referer = url
    else:
        url = 'https://%s.anjuke.com/sale/p%d/#filtersort' % (cityname, page)
        referer = 'https://%s.anjuke.com/sale/p%d/' % (cityname, page)
    headers = {
        'referer': referer,
        'user-agent': USER_AGENT,
    }
    # A timeout keeps one stalled connection from hanging the whole run.
    html = requests.get(url, headers=headers, timeout=30)

    # Image URLs
    jpg = re.findall(r'<img src="(.*?)" width="180" height="135" />', html.text)
    # Listing titles
    tail = re.findall(r'<a data-from="" data-company=""  title="(.*?)" href', html.text)

    # Listing details: flat token lists taken from the .details-item rows
    doc = pq(html.text)
    jli = doc('.details-item:nth-child(2n-1) span')
    oli = doc('.details-item:nth-child(2n) span:lt(3)')
    myroom = jli.text().split(' ')
    mylocal = oli.text().split(' ')
    print(mylocal)
    # Remove any whitespace character still left inside a token
    you = [re.sub(r'\s', "", token) for token in myroom]
    print(you)
    # Group by the number of tokens actually scraped instead of assuming 60
    # listings per page, which raised IndexError on shorter pages.
    myroomlocal = [[you[k], you[k + 1]] for k in range(0, len(you) - 1, 2)]
    mylocaldetail = [mylocal[m:m + 3] for m in range(0, len(mylocal) - 2, 3)]
    print(mylocaldetail)

    # Total price
    mytotal = re.findall(r'<span class="price-det"><strong>(.*?)</strong>', html.text)
    # Unit price
    simple = re.findall(r'<span class="unit-price">(.*?)</span> ', html.text)

    print(len(jpg))
    # Values are sent as bound parameters; only the validated table name is
    # formatted into the statement text.
    sql = ("insert into `{}_admin` values "
           "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)").format(cityname)
    # zip() stops at the shortest list, so a page on which one of the
    # patterns matched fewer times no longer raises IndexError.
    # NOTE(review): the lists are matched up purely by position.
    listings = zip(jpg, tail, mylocaldetail, myroomlocal, mytotal, simple)

    db = pymysql.connect(host="localhost", user="root", password="", database="anjuke")
    try:
        with db.cursor() as cursor:
            for i, (jpgs, scripts, detail, room, total, oneprice) in enumerate(listings):
                localroom, localarea, localhigh = detail
                local, localtwo = room
                row_id = page * 1000 + i
                cursor.execute(sql, (row_id, jpgs, scripts, local, total, oneprice,
                                     localroom, localarea, localhigh, localtwo))
        # One commit per page
        db.commit()
    finally:
        # Close the connection even when an insert fails
        db.close()
    page += 1

第二种优化办法是直接定位建造时间这一项,当它为空时填入“建造时间未知”。

#获取房子信息
 doc=pq(html.text)
 jli=doc('.details-item:nth-child(2n-1) span')
 oli=doc('.details-item:nth-child(2n)').items()
 myroom=jli.text().split(' ')
 mylocal=[]
 for li in oli:
    mylocal.append(li.text().split('|'))
 print(mylocal)
 print(len(mylocal))
 j=0
 while j <len(mylocal):
     if len(mylocal[j][3])==0:
         mylocal[j].insert(3,'建造时间未知')
     j=j+1
 you=[]
 i=0
 while  i <len(myroom):
    you.append(re.sub(r'\s', "",myroom[i]))
    i=i+1
 print(you)
 myroomlocal=[]
 k=0
 while k <60:
    myroomlocal.append([you[k*2],you[k*2+1]])
    k=k+1

最终优化完整代码。

import requests
import re
import pymysql
from pyquery import PyQuery as pq
# Scrape the first ten result pages of second-hand listings for one city on
# anjuke.com and insert one row per listing, including the build time, into
# the table `<cityname>_admin` of the local MySQL database `anjuke`.
LAST_PAGE = 10
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

print('请输入你要获取的城市')
cityname = input()

# The city name ends up in the host name and in the table name. Table names
# cannot be passed as bound SQL parameters, so only accept characters that
# are harmless inside an identifier.
if cityname and not re.fullmatch(r'[A-Za-z0-9_]+', cityname):
    raise SystemExit('城市名只能包含字母、数字和下划线')

page = 1
# `and`, not `&`: the bitwise operator binds tighter than the comparisons, so
# the old test was evaluated as `page < (11 & len(cityname)) != 0` and stopped
# early for most city names. An empty city name still skips the loop.
while page <= LAST_PAGE and cityname:
    print("这是第" + str(page) + "页")
    if page == 1:
        url = 'https://%s.anjuke.com/sale/' % cityname
    else:
        url = 'https://%s.anjuke.com/sale/p%d/#filtersort' % (cityname, page)
    headers = {
        # The referer is the same address as the page being requested
        'referer': url,
        'user-agent': USER_AGENT,
    }
    # A timeout keeps one stalled connection from hanging the whole run.
    html = requests.get(url, headers=headers, timeout=30)

    # Image URLs
    jpg = re.findall(r'<img src="(.*?)" width="180" height="135" />', html.text)
    # Listing titles
    tail = re.findall(r'<a data-from="" data-company=""  title="(.*?)" href', html.text)

    # Listing details
    doc = pq(html.text)
    jli = doc('.details-item:nth-child(2n-1) span')
    oli = doc('.details-item:nth-child(2n)').items()
    myroom = jli.text().split(' ')
    # One list of '|'-separated fields per even-numbered .details-item row
    mylocal = [li.text().split('|') for li in oli]
    print(mylocal)
    print(len(mylocal))
    # Make sure every row has a build-time entry at index 3. The old check
    # read mylocal[j][3] directly and raised IndexError when a row had only
    # three fields.
    for fields in mylocal:
        # Pad very short rows so indexes 0-2 always exist (no-op otherwise)
        fields.extend([''] * (3 - len(fields)))
        if len(fields) == 3 or not fields[3].strip():
            fields.insert(3, '建造时间未知')

    # Remove any whitespace character still left inside a token
    you = [re.sub(r'\s', "", token) for token in myroom]
    print(you)
    # Pair the tokens by the number actually scraped instead of assuming 60
    # listings per page, which raised IndexError on shorter pages.
    myroomlocal = [[you[k], you[k + 1]] for k in range(0, len(you) - 1, 2)]

    # Total price
    mytotal = re.findall(r'<span class="price-det"><strong>(.*?)</strong>', html.text)
    # Unit price
    simple = re.findall(r'<span class="unit-price">(.*?)</span> ', html.text)

    print(len(jpg))
    # Values are sent as bound parameters; only the validated table name is
    # formatted into the statement text.
    sql = ("insert into `{}_admin` values "
           "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)").format(cityname)
    # zip() stops at the shortest list, so a page on which one of the
    # patterns matched fewer times no longer raises IndexError.
    # NOTE(review): the lists are matched up purely by position.
    listings = zip(jpg, tail, mylocal, myroomlocal, mytotal, simple)

    db = pymysql.connect(host="localhost", user="root", password="", database="anjuke")
    try:
        with db.cursor() as cursor:
            for i, (jpgs, scripts, fields, room, total, oneprice) in enumerate(listings):
                # Rows may hold more than four fields, so index, don't unpack
                localroom = fields[0]
                localarea = fields[1]
                localhigh = fields[2]
                localtimes = fields[3]
                local, localtwo = room
                row_id = page * 1000 + i
                cursor.execute(sql, (row_id, jpgs, scripts, local, total, oneprice,
                                     localroom, localarea, localtimes, localhigh, localtwo))
        # One commit per page
        db.commit()
    finally:
        # Close the connection even when an insert fails
        db.close()
    page += 1

如有错误,还望大家指正。

  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值