# 公益数据爬虫脚本

公益数据爬虫脚本

代码

# -*- coding: utf-8 -*-
"""
Created on Sat Jan 27 21:56:47 2018

@author: caofk
"""

from selenium import webdriver
from pyquery import PyQuery as pq
from selenium.webdriver.common.action_chains import ActionChains
import time
import re
import pandas as pd

# Step 1: open the project list and, for every combination of the "status"
# and "category" drop-downs, record the row count and page count that the
# pager reports. The result is `meta_info`, one row per combination.
browser = webdriver.Firefox()
root = "http://gongyi.qq.com/succor/project_list.htm"
browser.get(root)
time.sleep(5)

# Locator-strategy string for CSS selectors (the value of Selenium's
# By.CSS_SELECTOR). Passing it to find_element() works on Selenium releases
# both before and after the find_element_by_* helpers were removed.
_CSS = "css selector"


def _pick_menu_option(menu_selector, option_selector):
    """Hover a drop-down, click one of its options and return the option label.

    menu_selector: CSS selector of the element that opens the list on hover.
    option_selector: CSS selector of the entry to click once the list is open.
    """
    menu = browser.find_element(_CSS, menu_selector)
    ActionChains(browser).move_to_element(menu).perform()
    time.sleep(5)  # fixed pause so the hover list is present before it is queried
    option = browser.find_element(_CSS, option_selector)
    label = option.text  # read before clicking; the click changes the page
    option.click()
    return label


def _number_before(suffix, text, default):
    """Return the first run of digits directly followed by *suffix* in *text*.

    Falls back to *default* when the pager text contains no such number, so a
    listing without a pager does not abort the whole crawl.
    """
    found = re.search(r"(\d+)" + suffix, text)
    return found.group(1) if found else default


meta_rows = []
for s_status in range(1, 4):
    s_status_name = _pick_menu_option(
        "#s_status_text",
        "#s_status_list > li:nth-child(%d) > a:nth-child(1)" % s_status,
    )
    for s_tid in range(2, 7):
        s_tid_name = _pick_menu_option(
            "#s_tid_text",
            "#s_tid_list > li:nth-child(%d) > a:nth-child(1)" % s_tid,
        )
        time.sleep(5)  # let the filtered list and its pager render
        page_info = browser.find_element(_CSS, "#projectPages_wrap").text
        meta_rows.append({
            "s_status_name": s_status_name,
            "s_status": s_status,
            "s_tid_name": s_tid_name,
            # Category id used in the URL fragment: list position 2..6 -> 71..75.
            "s_tid": 70 + s_tid - 1,
            # Counts stay strings, exactly as captured from the pager text.
            "row": _number_before("条", page_info, None),
            # With no page count shown, still visit one page of the listing.
            "p": _number_before("页", page_info, "1"),
        })

# Column order is fixed explicitly; "p" stays the last column.
meta_info = pd.DataFrame(
    meta_rows,
    columns=["s_status_name", "s_status", "s_tid_name", "s_tid", "row", "p"],
)

# Step 2: expand every (status, category) row of `meta_info` into one row per
# result page, adding the page number "p" and the page "url".
url_records = []
for _, meta_row in meta_info.iterrows():
    # Values shared by every page of this combination; looked up by column
    # name so the result does not depend on where "p" sits in meta_info.
    shared = {name: meta_row[name] for name in meta_info.columns if name != "p"}
    for p in range(1, int(meta_row["p"]) + 1):
        record = dict(shared)
        record["p"] = p
        # The site reads its filters and page number from the URL fragment.
        record["url"] = root + "#s_status=%d&s_tid=%d&p=%d" % (
            meta_row["s_status"], meta_row["s_tid"], p)
        url_records.append(record)

# Rows are listed last-generated first, so pages are visited in that order.
url_records.reverse()

# Build the frame once instead of concatenating per page; "url" is the last column.
url_pd = pd.DataFrame(
    url_records,
    columns=[name for name in meta_info.columns if name != "p"] + ["p", "url"],
)


# Step 3: load every URL in `url_pd` and keep the inner HTML of the project
# list container in an extra "item" column. The result is `page_pd`.
page_records = []
try:
    for i, (_, url_row) in enumerate(url_pd.iterrows(), start=1):
        print(i)  # progress: number of the page being fetched
        record = url_row.to_dict()
        browser.get(record["url"])
        time.sleep(1)  # short pause for the list to render before reading it
        record["item"] = pq(browser.page_source)("#projectList_wrap").html()
        page_records.append(record)
finally:
    # Nothing after this point uses the browser; always release the Firefox
    # session, even when a page load fails part-way through.
    browser.quit()

# Build the frame once; "item" stays the last column.
page_pd = pd.DataFrame(page_records, columns=list(url_pd.columns) + ["item"])

# Step 4: split each page's HTML into its ".pro_li" project entries, pull the
# labelled fields out of each entry's text and write everything to a CSV file.

# Output column -> pattern capturing the text between two labels of an entry.
# Patterns are compiled once and applied to the entry text with newlines removed.
_FIELD_PATTERNS = (
    ("公益简介", re.compile(r'项目简介(.*?)筹款目标')),
    ("筹款目标", re.compile(r'筹款目标(.*?)筹款时间')),
    ("筹款时间", re.compile(r'筹款时间(.*?)执 行 方')),
    ("执行方", re.compile(r'执 行 方(.*?)项目状态')),
    ("项目状态", re.compile(r'项目状态(.*?)已筹')),
    ("筹款情况", re.compile(r'已筹:(.*?)人次捐款')),
    ("筹款进度", re.compile(r'人次捐款(.*?)我要捐款')),
)
_LINK_PREFIX = 'http://gongyi.qq.com/succor/'

item_records = []
for i, (_, page_row) in enumerate(page_pd.iterrows(), start=1):
    print(i)  # progress: number of the page being parsed
    # Page-level values copied onto every entry; the raw HTML is not exported.
    page_fields = {name: page_row[name]
                   for name in page_pd.columns if name != "item"}
    for item in pq(page_row["item"])(".pro_li"):
        node = pq(item)
        text = node.text().replace("|", "").replace("\xa0", "")
        flat_text = text.replace("\n", "")
        href = node(".titless").attr('href')

        record = dict(page_fields)
        record["公益标题"] = text.split("\n")[0]  # first text line of the entry
        # An entry without a title link gets an empty link instead of
        # aborting the run on str + None.
        record["公益链接"] = _LINK_PREFIX + href if href is not None else ""
        for column, pattern in _FIELD_PATTERNS:
            found = pattern.search(flat_text)
            # First match wins; a label pair that is absent yields "".
            record[column] = found.group(1) if found else ""
        item_records.append(record)

# Build the frame once; rows get a sequential index in the CSV.
item_pd = pd.DataFrame(
    item_records,
    columns=[name for name in page_pd.columns if name != "item"]
    + ["公益标题", "公益链接"]
    + [column for column, _ in _FIELD_PATTERNS],
)

item_pd.to_csv("E:\\公益数据.csv")

数据结果

这里写图片描述

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值