import urllib.request
from selenium import webdriver
import time
import csv
import threading
import pandas as pd
from lxml import etree
from lxml.etree import HTMLParser
def _fetch_msds(cas_number):
    """Return the MSDS detail page for *cas_number*, or a not-found message.

    Starts a headless Chrome session, submits *cas_number* through the
    search form on http://www.somsds.com/ and, when the result page does
    not show the site's "nothing found" text, follows the link whose text
    equals *cas_number* and downloads that page.

    Args:
        cas_number: Search keyword, taken from the first CSV column
            (presumably a CAS registry number -- the form's 'cas' option
            is selected before submitting).

    Returns:
        The detail page HTML decoded as gb2312, or the fixed message
        '对不起,没有搜索到符合条件的MSDS数据' when the search has no hit.

    Raises:
        Whatever Selenium or urllib raise on failure; the browser is shut
        down before the exception propagates.
    """
    # Location of chromedriver. Raw string: the backslashes are literal
    # path separators, not escape sequences.
    chrome_driver = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
    option = webdriver.ChromeOptions()
    option.add_argument('headless')  # run Chrome without a visible window
    # NOTE(review): chrome_options/executable_path and find_element_by_* are
    # the Selenium 3 API; they are kept as-is because the installed Selenium
    # version is not visible from this file.
    driver = webdriver.Chrome(chrome_options=option, executable_path=chrome_driver)
    try:
        driver.get('http://www.somsds.com/')
        driver.find_element_by_name('keyword').send_keys(cas_number)
        driver.find_element_by_xpath(".//input[@value='cas']").click()
        driver.find_element_by_name('Submit').click()

        # The search result opens in a second window; switch to it.
        driver.switch_to.window(driver.window_handles[1])
        time.sleep(1)
        print('当前浏览地址为:.{0}'.format(driver.current_url))

        flag = driver.find_element_by_class_name('ms_l').text
        if "对不起,没有搜索到符合条件的MSDS数据,您可以访问" in flag:
            return '对不起,没有搜索到符合条件的MSDS数据'

        driver.find_element_by_link_text(cas_number).click()
        time.sleep(1)
        # The detail page opens in a third window; switch to it.
        driver.switch_to.window(driver.window_handles[2])
        time.sleep(1)

        # The page is re-fetched with a plain HTTP GET instead of reading
        # driver.page_source, which the original author found to return
        # incomplete source.
        with urllib.request.urlopen(driver.current_url) as response:
            return response.read().decode('gb2312')
    finally:
        # Always shut the browser down, even when a lookup step fails, so
        # no chromedriver process is left behind.
        driver.quit()


def main(csvFile):
    """Look up every keyword in *csvFile* and save the results to a CSV file.

    The first column of each row is used as the search keyword. One
    ``[keyword, page_or_message]`` pair is collected per row, each pair is
    printed, and all pairs are written to 'conclusionMessage.csv' in the
    current directory without header or index.

    Args:
        csvFile: An open text file object containing CSV data. It is
            closed before this function returns or raises.
    """
    results = []
    try:
        for row in csv.reader(csvFile):
            # csv.reader yields an empty list for blank lines; skip those
            # (and rows with an empty first field) instead of failing on
            # row[0] or submitting an empty search.
            if not row or not row[0].strip():
                continue
            keyword = row[0]
            print("编码值有:", keyword)
            results.append([keyword, _fetch_msds(keyword)])
    finally:
        csvFile.close()

    for entry in results:
        print(entry)

    # Save the collected results to a CSV file.
    pd.DataFrame(results).to_csv('conclusionMessage.csv', index=False, header=False)
if __name__ == '__main__':
    # Read the input data set. newline="" is what the csv module recommends
    # for files handed to csv.reader. main() closes the file itself when it
    # finishes; the with-block additionally guarantees the handle is
    # released if main() raises before reaching that point.
    with open("csvData.csv", "r", newline="") as csvFile:
        main(csvFile)
# 结构:
# 运行结果: