背景
公司最近交给我的任务是分析竞争对手的论坛数据。恰巧我也在学 Python,所以就用 Selenium 自动化的方式对目标网站进行爬取。
selenium
Selenium 是软件测试中常用的一个工具,它通过运行脚本的方式模拟人对浏览器的操作。我使用的环境是 Selenium 2 + 基于 Firefox 的 WebDriver + Python 3。
源码

```python
# -*- coding: utf-8 -*-
# @Time : 2019/3/29 11:54
# @Author : Merlin
# @Email : shiehmerlin@gmail.com
# @File : selenium.py
# @Software: PyCharm Community Edition
import time
from selenium import webdriver
import threading
import pymysql
# reload(sys)
# sys.setdefaultencoding('utf8')
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
# driver = webdriver.Chrome(chrome_options=chrome_options)
# Start a Firefox session; every page interaction below goes through it.
driver = webdriver.Firefox()
# driver.maximize_window()  # uncomment to run the browser full screen
url = 'http://kyddn.log56.com/sq_server/goSosPageAction.action?user_id=V1UxUFZXUmtkMnhDZUhOc1RtbElkMVpCYUU5TWR6MDk=&tag_id=2&source=sos&i=home&random=25.345469582047443'
driver.get(url)
time.sleep(2)  # fixed pause after loading the page before anything is read
# time.sleep(4)

# Module-level counters; main() defines its own locals with the same names.
count = 7
i = 1

# Element locator configuration. A full XPath is built as
#     title + str(index) + <suffix>
# where `title` is the shared prefix and each name below is the suffix that
# selects one field of the index-th entry.
title = "/html/body/div[2]/div["
title2 = "]/div[2]/div[2]/span[3]"
content = "]/div[2]/div[3]/span"
datetime = "]/div[2]/div[1]/div[2]/span[2]"
state = "]/div[1]/span"
tag = "]/div[2]/div[2]/span[2]"

# Date-stamped spreadsheet path; not referenced by the rest of the visible code.
nowtime = time.strftime('%Y-%m-%d', time.localtime(time.time()))
path = '..//file//' + 'kydd_' + nowtime + '.xls'

# Connection arguments are passed by keyword: recent PyMySQL releases no
# longer accept them positionally.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='root',
    database='xxx',
)
# Shared cursor object used for every statement issued by this script.
cursor = db.cursor()
def SQL(sql, params=None):
    """Execute one SQL statement on the module-level ``cursor``.

    Args:
        sql: Statement text. When ``params`` is given it may contain ``%s``
            placeholders.
        params: Optional values for the placeholders. They are handed to
            ``cursor.execute`` so the driver escapes them, instead of the
            caller concatenating them into ``sql``. Leaving it as ``None``
            runs ``sql`` exactly as written.

    Returns:
        True once ``cursor.execute`` has returned. Any exception raised by
        the cursor propagates to the caller. The statement is not committed
        here.
    """
    cursor.execute(sql, params)
    return True
# def gundong():
# for m in range(0, 100000):
# driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
# time.sleep(0.1)
# try:
# threading.Thread(target=gundong).start()
# except:pass
def main():
    """Scrape entries from the loaded page and insert them into ``kydd_data``.

    Starting at entry index 7 and stopping before 10000, each iteration
    scrolls to the bottom of the page, reads the title, content, date, state
    and tag of the current entry via the module-level XPath fragments, prints
    the title and content, inserts one row, and commits.

    A failed insert is reported and skipped so that one bad row does not stop
    the run. An element lookup that fails is not handled here; the exception
    propagates to the caller.
    """
    # Values are bound by the driver rather than concatenated into the SQL
    # text: scraped text may contain quotes, which would otherwise break the
    # statement or allow SQL injection.
    insert_sql = (
        "INSERT INTO kydd_data (riqi,zhuangtai,biaoqian,neirong,biaoti) "
        "VALUES (%s,%s,%s,%s,%s)"
    )

    def _field_text(index, suffix):
        # Text of one field of the index-th entry.
        return driver.find_element_by_xpath(title + str(index) + suffix).text

    for index in range(7, 10000):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        tit = _field_text(index, title2)
        con = _field_text(index, content)
        # NOTE(review): the year is hard-coded; presumably the page shows
        # dates without a year -- confirm before reusing in another year.
        datet = "2019-" + _field_text(index, datetime)
        nowstate = _field_text(index, state)
        biaoqian = _field_text(index, tag)
        print(tit, con)
        try:
            cursor.execute(insert_sql, (datet, nowstate, biaoqian, con, tit))
        except pymysql.MySQLError as exc:
            # Best effort: report the failed row and carry on with the next.
            print("SQL_Error!", exc)
        db.commit()
    print('Over!')
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # main() lets element-lookup failures propagate; presumably that is
        # how a run ends once no further entry exists on the page. Report the
        # reason instead of swallowing it silently.
        print("Stopped:", exc)
    finally:
        # Always release both external resources, on success or failure.
        try:
            db.close()
        finally:
            driver.quit()
```