本次记录,不详述原理,直接上代码。
详细代码见 GitHub。
本项目开发环境:
Selenium + Firefox + Firefox 驱动(geckodriver)+ PyCharm + Qt Designer
使用 Selenium 模拟浏览器操作拉勾网,获取网页数据
部分代码如下:
def lagou_search_key(keyword, main_browser, wait, url=lagou_url):
    """Search the site for ``keyword`` and return the result page's HTML.

    Args:
        keyword: Search term typed into the ``#search_input`` box.
        main_browser: Selenium WebDriver used to load and drive the page.
        wait: Object exposing ``until(condition)`` (presumably a
            ``WebDriverWait`` bound to ``main_browser`` -- confirm at the
            call site).
        url: Page opened before searching; defaults to ``lagou_url``.

    Returns:
        ``main_browser.page_source`` read after the search was submitted.

    Note:
        The whole sequence (reload, type, click) is retried for as long as
        it raises ``TimeoutException``, so this call blocks until a search
        succeeds. Retries reuse the caller's ``url``.
    """
    # Iterate instead of recursing: the retry result is actually returned,
    # the caller's ``url`` is kept, and repeated timeouts cannot exhaust
    # the interpreter's recursion limit.
    while True:
        print("正在搜索:" + keyword)
        main_browser.get(url)

        try:
            city_btn = wait.until(
                EC.element_to_be_clickable((By.LINK_TEXT, '深圳站'))
            )
            city_btn.click()
        except Exception:
            # Deliberate best effort: the '深圳站' link is clicked only when
            # it becomes clickable; any failure here must not abort the
            # search itself.
            pass

        try:
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#search_input'))
            )
            search_btn = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#search_button'))
            )
            search_box.send_keys(keyword)
            time.sleep(2)
            search_btn.click()
            # NOTE: an earlier revision switched to a newly opened window
            # handle at this point; the page is now read from the active
            # window after a fixed pause.
            time.sleep(2)
            return main_browser.page_source
        except TimeoutException:
            # Search controls did not show up in time: reload and retry.
            continue
def lagou_next_page(main_browser, wait, max_page=100):
    """Click the "next page" control and return the resulting page's HTML.

    Args:
        main_browser: Selenium WebDriver showing a result page.
        wait: Object exposing ``until(condition)`` (presumably a
            ``WebDriverWait`` bound to ``main_browser`` -- confirm at the
            call site).
        max_page: Highest page number whose HTML should still be returned.

    Returns:
        ``main_browser.page_source`` after the click, or ``None`` when the
        next-page control does not become clickable in time or the page
        number read from ``.pager_is_current`` exceeds ``max_page``.
    """
    try:
        next_btn = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '.pager_next '))
        )
        next_btn.click()
        # ``find_element_by_css_selector`` was removed in Selenium 4;
        # ``find_element(By..., ...)`` works on both old and new releases.
        current_marker = main_browser.find_element(
            By.CSS_SELECTOR, '.pager_is_current'
        )
        # The page number is read after the click, so it describes the
        # page that was just navigated to.
        if int(current_marker.text) > max_page:
            return None
        return main_browser.page_source
    except TimeoutException:
        return None
使用 PyQt5 设计界面
部分代码如下:
class myWindow(QtWidgets.QMainWindow, Ui_Form):
    """Main window that collects search settings and runs the scrape.

    The widgets (``lineEdit`` .. ``lineEdit_4``, ``pushButton``) are created
    by ``Ui_Form.setupUi``. Clicking ``pushButton`` runs ``get_input``.
    """

    def __init__(self):
        """Build the UI, pre-fill default inputs and wire up the button."""
        super(myWindow, self).__init__()
        self.setupUi(self)
        self.lineEdit.setText("python")      # search keyword / CSV base name
        self.lineEdit_2.setText("2")         # maximum page number
        self.lineEdit_3.setText(os.getcwd())  # output directory
        self.pushButton.clicked.connect(self.get_input)

    def get_input(self):
        """Read the form, scrape the result pages and write them to a CSV.

        The window is disabled while the scrape runs. It is re-enabled, and
        the browser is closed, even when reading the form or scraping
        raises; the exception itself still propagates to the caller.

        Raises:
            ValueError: If the page-count field does not hold an integer.
        """
        self.setEnabled(False)
        try:
            self.filename = self.lineEdit.text()
            self.max_page = int(self.lineEdit_2.text())
            self.save_path = self.lineEdit_3.text()
            self.search_url = self.lineEdit_4.text()

            self.b, self.w = create_browser()
            try:
                self._scrape_to_csv()
            finally:
                # Always release the browser so a failed run does not leave
                # a stray browser/driver process behind.
                self.b.quit()
        finally:
            # Never leave the window stuck in the disabled state.
            self.setEnabled(True)

    def _scrape_to_csv(self):
        """Run the search and append every fetched page to the CSV file."""
        if self.search_url:
            html = lagou_search_key(self.filename, self.b, self.w, self.search_url)
        else:
            # An empty URL field would override the search function's
            # default URL with "", so fall back to that default instead.
            html = lagou_search_key(self.filename, self.b, self.w)

        csv_name = os.path.join(self.save_path, self.filename + '.csv')
        headers = ['发布时间','职位链接', '职位', '职位位置','薪资', '基本要求', '公司', '公司规模', '公司链接']
        lagou_csv_write(csv_name, headers, html)

        while True:
            time.sleep(2)
            html = lagou_next_page(self.b, self.w, self.max_page)
            if not html:
                # No further page (or page limit reached): stop paging.
                break
            lagou_csv_write(csv_name, headers, html, writeheader=False)