I am using a scraper. I modified it to fit my needs, and the code is below:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import urllib.request, urllib.error, urllib.parse
import re
import ssl
import pandas as pd
import numpy as np
import os
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
prefs = {'download.default_directory' : os.getcwd()}
chrome_options.add_experimental_option('prefs', prefs)
class SoupMaker():
    """
    A class that scrapes Indeed's job ads.
    """
    def __init__(self, _url, _driver):
        self.base_url = "https://www.indeed.com"
        self.home_url = self.base_url + _url
        self.job_links = []
        self.driver = _driver
        self.job_datas = []
        self.job_table = []

    def read_page(self):
        # Fetch the search-results page with urllib, skipping SSL verification.
        self.ctx = ssl.create_default_context()
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE
        print("Parsing: ", self.home_url)
        self.url = urllib.request.urlopen(self.home_url,
                                          context=self.ctx).read()
        _soup1 = bs(self.url, "html.parser")
        self.a_tags = _soup1('a')

    def get_job_url(self):
        # Collect hrefs that look like individual job postings.
        for link in self.a_tags:
            link = link.get("href", None)
            if link is not None:
                cmp_url = re.search("^/.+/.+/jobs/.+", link)
                rc_url = re.search("^/rc.+", link)
                if cmp_url or rc_url:
                    self.job_links.append(self.base_url + link.strip())

    def get_job_info(self):
        # Visit each job link with Selenium and pull out the details.
        for link in self.job_links:
            print("    Scraping: ", link)
            self.driver.get(link)
            self.driver.implicitly_wait(2750)
            _soup2 = bs(self.driver.page_source, "lxml")
            self.title = _soup2.find("title").get_text()
            self.job_descs = _soup2.find_all('div', 'jobsearch-JobComponent-description icl-u-xs-mt--md')
            self.job_origins = _soup2.find_all('div', 'jobsearch-JobMetadataFooter')
            self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]
            self.job_location = re.findall(".+ - (.+) - .+", self.title)[0]
            self.description = ''
            for d in self.job_descs:
                self.description += d.get_text("|", strip=True)
            self.origin = re.findall("^.+ ago", self.job_origins[0].get_text())[0]
            self.job_datas.append(self.job_title)
            self.job_datas.append(self.job_location)
            self.job_datas.append(self.description)
            self.job_datas.append(self.origin)
        # A page is expected to yield exactly 10 jobs with 4 fields each.
        self.x = np.array(self.job_datas).reshape((10, 4))
        df = pd.DataFrame(data=self.x, columns=['Job Title', 'Job Location',
                                                'Job Description', 'Job Origin'])
        return df
if __name__ == '__main__':
    n = int(input("Enter no. of pages to scrape: "))
    n = n * 10
    file_name = input("Enter output filename: ")
    # driver = webdriver.Chrome(r"C:\chromedriver\chromedriver.exe")
    driver = webdriver.Chrome('/usr/local/bin/chromedriver', chrome_options=chrome_options)
    writer = pd.ExcelWriter('{0}.xlsx'.format(file_name), engine='xlsxwriter')
    df = []
    for i in range(10, n + 10, 10):
        # Indeed paginates search results in steps of 10.
        #ext = "/jobs?q=&l=United+States&start={0}".format(i - 10)
        ext = "/jobs?l=United+States&start={0}".format(i - 10)
        if n == 10:
            #ext = "/jobs-in-United+States"
            ext = "/l-United+States-jobs.html"
        s = SoupMaker(ext, driver)
        s.read_page()
        s.get_job_url()
        df.append(s.get_job_info())
    result = pd.concat(df)
    result.to_excel(writer, index=False)
    writer.save()
    driver.close()
The script works fine if I only scrape one page, but if I try to scrape more than 10 pages it throws the following error:
Traceback (most recent call last):
  File "file.py", line 96, in <module>
    df.append(s.get_job_info())
  File "file.py", line 71, in get_job_info
    self.x = np.array(self.job_datas).reshape((10,4))
ValueError: cannot reshape array of size 0 into shape (10,4)
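From what I can tell, the ValueError itself reproduces in isolation whenever job_datas ends up empty, i.e. no job links got collected for a page (a minimal sketch of my understanding, not part of the script):

    import numpy as np

    # If a results page yields no matching job links, job_datas stays
    # empty, and an empty array cannot be forced into a fixed (10, 4) shape:
    job_datas = []
    np.array(job_datas).reshape((10, 4))
    # ValueError: cannot reshape array of size 0 into shape (10,4)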
And if the number of pages entered is greater than 100 or 50, it gives this error instead:
Traceback (most recent call last):
  File "file.py", line 100, in <module>
    df.append(s.get_job_info())
  File "file.py", line 64, in get_job_info
    self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]
IndexError: list index out of range
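This one also reproduces on its own: if a page title doesn't contain two " - " separators (my guess is that Indeed starts serving a block/captcha page after too many requests), re.findall returns an empty list and indexing it fails. The title below is hypothetical:

    import re

    # Hypothetical title of a non-job page (e.g. a block/captcha page);
    # the pattern expects "title - location - Indeed" and finds no match:
    title = "Just a moment..."
    matches = re.findall("(.+) - .+ - .+", title)
    print(matches)          # []
    job_title = matches[0]  # IndexError: list index out of range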
I would really appreciate it if someone could help me figure this out. Thanks in advance!