python找不到reshape_Python Value Reshape问题

我正在使用刮板 .

我根据自己的需要修改了它,代码如下:

from bs4 import BeautifulSoup as bs

from selenium import webdriver

import urllib.request, urllib.error, urllib.parse

import re

import ssl

import pandas as pd

import numpy as np

import os

chrome_options = webdriver.ChromeOptions()

chrome_options.add_argument('--headless')

chrome_options.add_argument('--no-sandbox')

chrome_options.add_argument('--disable-dev-shm-usage')

#chrome_options = webdriver.ChromeOptions()

#chrome_options.add_argument('--no-sandbox')

prefs = {'download.default_directory' : os.getcwd()}

chrome_options.add_experimental_option('prefs', prefs)

class SoupMaker():

"""

A class that scrapes indeed's Job ads

"""

def __init__(self, _url, _driver):

self.base_url = "https://www.indeed.com"

self.home_url = self.base_url + _url

self.job_links = []

self.driver = _driver

self.job_datas = []

self.job_table = []

def read_page(self):

self.ctx = ssl.create_default_context()

self.ctx.check_hostname = False

self.ctx.verify_mode = ssl.CERT_NONE

print("Parsing: ", self.home_url)

self.url = urllib.request.urlopen(self.home_url,

context = self.ctx).read()

_soup1 = bs(self.url, "html.parser")

self.a_tags = _soup1('a')

def get_job_url(self):

for link in self.a_tags:

link = link.get("href", None)

if link != None:

cmp_url = re.search("^/.+/.+/jobs/.+", link)

rc_url = re.search("^/rc.+", link)

if cmp_url or rc_url:

self.job_links.append(self.base_url + link.strip())

def get_job_info(self):

for link in self.job_links:

print(" Scraping: ", link)

self.driver.get(link)

self.driver.implicitly_wait(2750)

_soup2 = bs(self.driver.page_source, "lxml")

self.title = _soup2.find("title").get_text()

self.job_descs = _soup2.find_all('div', 'jobsearch-JobComponent-description icl-u-xs-mt--md')

self.job_origins = _soup2.find_all('div', 'jobsearch-JobMetadataFooter')

self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]

self.job_location = re.findall(".+ - (.+) - .+", self.title)[0]

self.description = ''

for d in self.job_descs:

self.description += d.get_text("|", strip = True)

self.origin = re.findall("^.+ ago", self.job_origins[0].get_text())[0]

self.job_datas.append(self.job_title)

self.job_datas.append(self.job_location)

self.job_datas.append(self.description)

self.job_datas.append(self.origin)

self.x = np.array(self.job_datas).reshape((10,4))

df = pd.DataFrame(data=self.x, columns=['Job Title', 'Job Location',

'Job Description', 'Job Origin'])

return df

if __name__ == '__main__':

n = int(input("Enter no. of pages to scrape: "))

n = n*10

file_name = input("Enter CSV filename: ")

# driver = webdriver.Chrome(r"C:\chromedriver\chromedriver.exe")

#driver = webdriver.Chrome('/usr/local/bin/chromedrive')

driver = webdriver.Chrome('/usr/local/bin/chromedriver',chrome_options=chrome_options)

chrome_options=chrome_options

writer = pd.ExcelWriter('{0}.xlsx'.format(file_name), engine='xlsxwriter')

df = []

for i in range(10, n+10, 10):

#ext = "/jobs?q=&l=United+States&start={0}".format(i-10)

ext = "/jobs?l=United+States&start={0}".format(i-10)

if n == 10:

#ext = "/jobs-in-United+States"

ext ="/l-United+States-jobs.html"

s = SoupMaker(ext, driver)

s.read_page()

s.get_job_url()

df.append(s.get_job_info())

result = pd.concat(df)

result.to_excel(writer, index=False)

writer.save()

driver.close()

该脚本工作正常如果我只尝试废弃1页,但如果我试图刮掉超过10页,则会出现以下错误:

回溯(最近一次调用最后一次):文件“file.py”,第96行,在(模块)中df.append(s.get_job_info())文件“file.py”,第71行,在get_job_info self.x = np中.array(self.job_datas).reshape((10,4))ValueError:无法将大小为0的数组重塑为形状(10,4)

如果页面的输入大于100或50,则会出现以下错误:

回溯(最近一次调用最后一次):文件“file.py”,第100行,在df.append(s.get_job_info())文件“file.py”,第64行,在get_job_info self.job_title = re.findall( “( . ) - . - . ”,self.title)[0] IndexError:列表索引超出范围

如果有人能帮我解决这个问题,我将非常感激!提前致谢!

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值