python 爬取投融界专业平台

远方的飞猪

已于 2023-07-30 14:27:59 修改

阅读量229

点赞数

分类专栏：爬虫 Python 文章标签： python xpath

于 2020-09-20 11:04:06 首次发布

本文链接：https://blog.csdn.net/tanjunchen/article/details/108675845

版权

爬虫同时被 2 个专栏收录

22 篇文章 4 订阅

订阅专栏

Python

21 篇文章 1 订阅

订阅专栏

完整代码下载：

https://github.com/tanjunchen/SpiderProject/blob/master/tourongzi/Spider.py

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from fake_useragent import UserAgent
import pandas as pd
from lxml import etree
import re

ua = UserAgent()
headers = {"User-Agent": ua.random}


def job():
    url = "https://www.trjcn.com/investor_data.html?type=1&page={0}"
    headers.pop("Referer", url.format(1))
    total = get_total(url, headers)
    dates = []
    events = []
    sources = []
    destinations = []
    moneys = []
    realms = []
    for page in range(1, total + 1):
        res = requests.get(url.format(page), headers=headers)
        res.encoding = "utf-8"
        print("正在抓取第", str(page) + "页")
        if res.ok:
            html = etree.HTML(res.text)
            if html is not None:
                trs = html.xpath("//div[@class='fn-left list']/table//tr")
                for i in range(1, len(trs)):
                    tds = trs[i].xpath(".//td")
                    date = "-".join(tds[0].xpath(".//text()"))
                    event = "-".join(tds[1].xpath(".//text()"))
                    source = "-".join(tds[2].xpath(".//text()"))
                    destination = "-".join(tds[3].xpath(".//text()"))
                    money = "-".join(tds[4].xpath(".//text()"))
                    realm = "-".join(tds[5].xpath(".//text()"))
                    dates.append(date)
                    events.append(event)
                    sources.append(source)
                    destinations.append(destination)
                    moneys.append(money)
                    realms.append(realm)
                    print("正在抓取第", str(page) + "页", date, event, source, destination, money, realm)
    all_data = []
    for i in range(len(events)):
        all_data.append([dates[i], events[i], sources[i], destinations[i], moneys[i], realms[i]])
    df = pd.DataFrame(all_data)
    new_col = ['日期', '融资事件', '融资方', '投资方', '金额与轮次', '融资领域']
    df.columns = new_col
    df.to_csv("all_data.csv", encoding="utf-8", index=False)


def get_total(url, header):
    headers.pop("Referer", url)
    res = requests.get(url, headers=header)
    res.encoding = "utf-8"
    html = etree.HTML(res.text)
    total = re.findall('\d+', html.xpath("//div[@class='paging fn-right']//a[last()]/@href")[0])[-1]
    return int(total)


if __name__ == '__main__':
    job()