I am using a scraper. I modified it to fit my needs, and the code is below:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import urllib.request, urllib.error, urllib.parse
import re
import ssl
import pandas as pd
import numpy as np
import os
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
prefs = {'download.default_directory' : os.getcwd()}
chrome_options.add_experimental_option('prefs', prefs)
class SoupMaker():
    """
    A class that scrapes Indeed's job ads.
    """
    def __init__(self, _url, _driver):
        self.base_url = "https://www.indeed.com"
        self.home_url = self.base_url + _url
        self.job_links = []
        self.driver = _driver
        self.job_datas = []
        self.job_table = []

    def read_page(self):
        # Fetch the search-results page with urllib, skipping SSL verification.
        self.ctx = ssl.create_default_context()
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE
        print("Parsing: ", self.home_url)
        self.url = urllib.request.urlopen(self.home_url,
                                          context=self.ctx).read()
        _soup1 = bs(self.url, "html.parser")
        self.a_tags = _soup1('a')

    def get_job_url(self):
        # Collect hrefs that look like individual job postings.
        for link in self.a_tags:
            link = link.get("href", None)
            if link is not None:
                cmp_url = re.search("^/.+/.+/jobs/.+", link)
                rc_url = re.search("^/rc.+", link)
                if cmp_url or rc_url:
                    self.job_links.append(self.base_url + link.strip())

    def get_job_info(self):
        # Visit each job link with Selenium and pull out the details.
        for link in self.job_links:
            print("    Scraping: ", link)
            self.driver.get(link)
            self.driver.implicitly_wait(2750)
            _soup2 = bs(self.driver.page_source, "lxml")
            self.title = _soup2.find("title").get_text()
            self.job_descs = _soup2.find_all('div', 'jobsearch-JobComponent-description icl-u-xs-mt--md')
            self.job_origins = _soup2.find_all('div', 'jobsearch-JobMetadataFooter')
            self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]
            self.job_location = re.findall(".+ - (.+) - .+", self.title)[0]
            self.description = ''
            for d in self.job_descs:
                self.description += d.get_text("|", strip=True)
            self.origin = re.findall("^.+ ago", self.job_origins[0].get_text())[0]
            self.job_datas.append(self.job_title)
            self.job_datas.append(self.job_location)
            self.job_datas.append(self.description)
            self.job_datas.append(self.origin)
        # A page is expected to yield exactly 10 jobs with 4 fields each.
        self.x = np.array(self.job_datas).reshape((10, 4))
        df = pd.DataFrame(data=self.x, columns=['Job Title', 'Job Location',
                                                'Job Description', 'Job Origin'])
        return df
if __name__ == '__main__':
    n = int(input("Enter no. of pages to scrape: "))
    n = n * 10
    file_name = input("Enter output filename: ")
    # driver = webdriver.Chrome(r"C:\chromedriver\chromedriver.exe")
    driver = webdriver.Chrome('/usr/local/bin/chromedriver', chrome_options=chrome_options)
    writer = pd.ExcelWriter('{0}.xlsx'.format(file_name), engine='xlsxwriter')
    df = []
    for i in range(10, n + 10, 10):
        # Indeed paginates search results in steps of 10.
        #ext = "/jobs?q=&l=United+States&start={0}".format(i - 10)
        ext = "/jobs?l=United+States&start={0}".format(i - 10)
        if n == 10:
            #ext = "/jobs-in-United+States"
            ext = "/l-United+States-jobs.html"
        s = SoupMaker(ext, driver)
        s.read_page()
        s.get_job_url()
        df.append(s.get_job_info())
    result = pd.concat(df)
    result.to_excel(writer, index=False)
    writer.save()
    driver.close()
The script works fine if I only scrape one page, but if I try to scrape more than 10 pages it throws the following error:
Traceback (most recent call last):
  File "file.py", line 96, in <module>
    df.append(s.get_job_info())
  File "file.py", line 71, in get_job_info
    self.x = np.array(self.job_datas).reshape((10,4))
ValueError: cannot reshape array of size 0 into shape (10,4)
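From what I can tell, the ValueError itself reproduces in isolation whenever job_datas ends up empty, i.e. no job links got collected for a page (a minimal sketch of my understanding, not part of the script):

    import numpy as np

    # If a results page yields no matching job links, job_datas stays
    # empty, and an empty array cannot be forced into a fixed (10, 4) shape:
    job_datas = []
    np.array(job_datas).reshape((10, 4))
    # ValueError: cannot reshape array of size 0 into shape (10,4)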
And if the number of pages entered is greater than 100 or 50, it gives this error instead:
Traceback (most recent call last):
  File "file.py", line 100, in <module>
    df.append(s.get_job_info())
  File "file.py", line 64, in get_job_info
    self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]
IndexError: list index out of range
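This one also reproduces on its own: if a page title doesn't contain two " - " separators (my guess is that Indeed starts serving a block/captcha page after too many requests), re.findall returns an empty list and indexing it fails. The title below is hypothetical:

    import re

    # Hypothetical title of a non-job page (e.g. a block/captcha page);
    # the pattern expects "title - location - Indeed" and finds no match:
    title = "Just a moment..."
    matches = re.findall("(.+) - .+ - .+", title)
    print(matches)          # []
    job_title = matches[0]  # IndexError: list index out of range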
I would really appreciate it if someone could help me figure this out. Thanks in advance!