# Disclaimer: this code is for learning and exchange only. Neither the author nor anyone
# sharing it accepts responsibility for malicious use by others. Do not modify the
# rate-limiting parameters, do not attack the target site, and observe public morality and
# the law. The operator of the machine bears full responsibility for losses (such as
# crashing the target pages) caused by running this crawler; serious consequences may
# carry criminal liability.
# Custom scraper commissions: email leon_leon@yeah.net
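# Overview: read listed-company stock codes from a CSV, open each company's profile page
# on www.sse.com.cn with Selenium (Edge), parse the info table with lxml, and write the
# collected fields to a new CSV.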
import requests
from fake_useragent import UserAgent
from lxml import etree
from time import sleep
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
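# Note: requests and fake_useragent are only needed for the commented-out requests-based
# variant below; the Selenium path needs an Edge WebDriver (msedgedriver), which recent
# Selenium releases can fetch automatically.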
code_info = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\上市公司代码.csv',encoding='UTF-8')
print(code_info)
row_num = len(code_info)
print(row_num)
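# Build one company-profile URL per stock code read from the first CSV column.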
url_manager = []
url_base = 'http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE={}'
company_code = []
company_date = []
company_name = []
company_site = []
company_address = []
company_csrc_type = []
company_sse_type = []
company_province =[]
company_status = []
company_shangzheng180 = []
company_is_overseas = []
company_overseas = []
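# One list per output field; each iteration of the main loop appends one company's values,
# so all lists stay aligned by index.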
for i in range(row_num):
    url_info = code_info.iloc[i, 0]
    url_manager.append(url_base.format(url_info))
print(url_manager)
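# Main loop: load each company page, wait 1-3 seconds to limit the request rate, then parse
# the fields out of the profile table by their fixed row positions.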
# index1 = [521, 1027, 1355, 1523]  # optional: row indices to re-run selectively
for i in range(row_num):  # switch to "for i in index1:" to re-run only those rows
    print(i + 1, '\t', '{:.2f}%'.format(i / row_num * 100), '\t', url_manager[i])
    # Alternative without a browser: plain requests with a random User-Agent and a proxy.
    # headers = {
    #     'User-Agent': UserAgent().chrome
    # }
    # proxies = {
    #     "http": "http://35.236.158.232:8080"  # replace with any working HTTP proxy
    # }
    # html_response = requests.get(url_manager[i], headers=headers, proxies=proxies)
    url_company = url_manager[i]
    edge = webdriver.Edge()  # a fresh Edge session per page
    edge.get(url_company)
    # edge.get('http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE=605111')
    sleep(randint(1, 3))  # wait 1-3 seconds so the page renders and requests stay rate-limited
    html_response = edge.page_source
    e = etree.HTML(html_response)
    # Each field sits in a fixed row of the profile table; join() flattens the text nodes
    # (empty string if the row is missing).
    company_code.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[1]/td[1]/text()")))
    company_date.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[3]/td[1]/a/text()")))
    company_name.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[6]/td[1]/text()")))
    company_site.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[7]/td[1]/text()")))
    company_address.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[8]/td[1]/text()")))
    company_csrc_type.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[14]/td[1]/text()")))
    company_sse_type.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[15]/td[1]/text()")))
    company_province.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[16]/td[1]/text()")))
    company_status.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[17]/td[1]/text()")))
    company_shangzheng180.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[18]/td[1]/text()")))
    company_is_overseas.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[19]/td[1]/text()")))
    company_overseas.append(''.join(e.xpath("//table[@class = 'table search_']/tbody/tr[20]/td[1]/text()")))
    edge.quit()  # quit() (rather than close()) terminates the driver process started for this page
all_info = {
    '上市公司代码': company_code, '上市时间': company_date, '上市公司名称': company_name,
    '公司注册地址': company_site, '公司通讯地址': company_address, 'csrc分类': company_csrc_type,
    'sse分类': company_sse_type, '所属省份': company_province, '公司状态': company_status,
    '上证180': company_shangzheng180, '是否境外上市': company_is_overseas, '境外上市地': company_overseas
}
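# Assemble all collected fields into a DataFrame and write them out; GBK encoding is
# typically what Excel expects for Chinese text on Windows.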
outdata = pd.DataFrame(all_info)
outdata.to_csv('上市公司信息补充.csv', encoding='GBK')
print('爬虫已完成')  # "scraping finished"