爬虫: 爬取天天基金网的公司信息
# -*- coding: UTF-8 -*-
import requests
import parsel
import re
import pandas as pd
def tiantianjijin_main(cnt=100, output_path="tiantianjijin.xlsx", timeout=10):
    """Scrape fund-company names and scales from fund.eastmoney.com into Excel.

    Downloads the company overview page, walks every ``<tr>`` element that
    carries a ``class`` attribute, extracts the company name and its scale
    text, prints each pair, and finally writes the collected rows to an Excel
    workbook whose row index starts at 1.

    Args:
        cnt: Maximum number of companies to collect. Values <= 0 collect none.
        output_path: Path of the Excel workbook to write.
        timeout: Seconds to wait for the HTTP server before giving up.

    Raises:
        requests.RequestException: If the page cannot be fetched (connection
            failure, timeout, or an HTTP error status).
    """
    base_url = 'http://fund.eastmoney.com/company/default.html'
    # Send a desktop-browser User-Agent with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

    # A timeout keeps the script from hanging forever on a stalled connection;
    # raise_for_status stops an error page from being parsed into an empty sheet.
    response = requests.get(url=base_url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Decode with the encoding detected from the body so Chinese text is readable.
    response.encoding = response.apparent_encoding
    selector = parsel.Selector(response.text)

    # Compile once, outside the loop.
    name_pattern = re.compile(r'.html">(.*)</a>')
    scale_pattern = re.compile(r'<p class="td-gm">(.*)<span class=')

    company_info_list = []
    for row in selector.xpath('//tr[@class]'):
        if len(company_info_list) >= cnt:
            break

        name_html = row.xpath('./td[@class="td-align-left"]/a[@href]').get()
        if name_html is None:
            # Not a company row (no name link in it).
            continue
        name_match = name_pattern.search(name_html)
        if name_match is None:
            # Link markup is not in the expected form; skip instead of crashing.
            continue
        company_name = name_match.group(1).strip()

        # The scale cell may be absent or shaped differently; record None then
        # rather than failing the whole run.
        scale_html = row.xpath('./td[@class="scale number "]/p[@class="td-gm"]').get()
        scale_match = scale_pattern.search(scale_html) if scale_html is not None else None
        company_scale = scale_match.group(1).strip() if scale_match is not None else None

        print(company_name)
        print(company_scale)
        company_info_list.append([company_name, company_scale])

    # Number the rows from 1 instead of pandas' default 0.
    df = pd.DataFrame(
        company_info_list,
        columns=['company_name', 'company_scale'],
        index=range(1, len(company_info_list) + 1),
    )
    df.to_excel(output_path, index=True)
    print(company_info_list)
    print('tiantianjijin completed.')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    tiantianjijin_main()
结果: