Scraping the 投融界 (trjcn.com) investment and financing platform with Python

Full code download:

https://github.com/tanjunchen/SpiderProject/blob/master/tourongzi/Spider.py
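
The script below pages through the trjcn.com investor listing, pulls each funding record out of the results table with lxml XPath, and writes everything to all_data.csv with pandas. It depends on the requests, fake-useragent, lxml, and pandas packages.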

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from fake_useragent import UserAgent
import pandas as pd
from lxml import etree
import re

ua = UserAgent()
# Pick a random User-Agent for the session so requests look less like a bot.
headers = {"User-Agent": ua.random}


def job():
    url = "https://www.trjcn.com/investor_data.html?type=1&page={0}"
    # Set a Referer header so the requests look like they came from the site itself.
    headers["Referer"] = url.format(1)
    total = get_total(url, headers)
    dates = []
    events = []
    sources = []
    destinations = []
    moneys = []
    realms = []
    for page in range(1, total + 1):
        res = requests.get(url.format(page), headers=headers)
        res.encoding = "utf-8"
        print("正在抓取第", str(page) + "页")
        if res.ok:
            html = etree.HTML(res.text)
            if html is not None:
                trs = html.xpath("//div[@class='fn-left list']/table//tr")
                for i in range(1, len(trs)):
                    tds = trs[i].xpath(".//td")
                    # Join every text node in a cell with "-" to flatten nested markup.
                    date = "-".join(tds[0].xpath(".//text()"))
                    event = "-".join(tds[1].xpath(".//text()"))
                    source = "-".join(tds[2].xpath(".//text()"))
                    destination = "-".join(tds[3].xpath(".//text()"))
                    money = "-".join(tds[4].xpath(".//text()"))
                    realm = "-".join(tds[5].xpath(".//text()"))
                    dates.append(date)
                    events.append(event)
                    sources.append(source)
                    destinations.append(destination)
                    moneys.append(money)
                    realms.append(realm)
                    print("正在抓取第", str(page) + "页", date, event, source, destination, money, realm)
    all_data = []
    for i in range(len(events)):
        all_data.append([dates[i], events[i], sources[i], destinations[i], moneys[i], realms[i]])
    df = pd.DataFrame(all_data)
    # Column headers (kept in Chinese, as on the site): date, funding event, fundraiser, investor, amount & round, sector.
    new_col = ['日期', '融资事件', '融资方', '投资方', '金额与轮次', '融资领域']
    df.columns = new_col
    df.to_csv("all_data.csv", encoding="utf-8", index=False)


def get_total(url, header):
    # Read the last link in the pager to find out how many pages of results exist.
    header["Referer"] = url.format(1)
    res = requests.get(url.format(1), headers=header)
    res.encoding = "utf-8"
    html = etree.HTML(res.text)
    # The href of the last paging link ends with the highest page number.
    total = re.findall(r'\d+', html.xpath("//div[@class='paging fn-right']//a[last()]/@href")[0])[-1]
    return int(total)


if __name__ == '__main__':
    job()
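
Once job() has run, the CSV can be loaded back for a quick sanity check; a minimal sketch, assuming all_data.csv sits next to the script:

import pandas as pd

df = pd.read_csv("all_data.csv", encoding="utf-8")
print(df.head())                      # preview the first few funding events
print(df['融资领域'].value_counts())  # rough number of events per sector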