Tencent Gongyi public welfare data crawler script
Code
"""
Created on Sat Jan 27 21:56:47 2018
@author: caofk
"""
from selenium import webdriver
from pyquery import PyQuery as pq
from selenium.webdriver.common.action_chains import ActionChains
import time
import re
import pandas as pd
browser = webdriver.Firefox()
root = "http://gongyi.qq.com/succor/project_list.htm"
browser.get(root)
time.sleep(5)
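# Step 1: walk the status (s_status) and category (s_tid) filter menus and
# record, for each combination, the total row and page counts shown by the pager.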
meta_info = pd.DataFrame()
for s_status in range(1, 4):
    # open the "project status" dropdown and pick the s_status-th option
    choose = browser.find_element_by_css_selector("#s_status_text")
    ActionChains(browser).move_to_element(choose).perform()
    time.sleep(5)
    choose = browser.find_element_by_css_selector("#s_status_list > li:nth-child(%d) > a:nth-child(1)" % s_status)
    s_status_name = choose.text
    choose.click()
    for s_tid in range(2, 7):
        # open the "project category" dropdown and pick the s_tid-th option
        base_info = pd.DataFrame()
        choose = browser.find_element_by_css_selector("#s_tid_text")
        ActionChains(browser).move_to_element(choose).perform()
        time.sleep(5)
        choose = browser.find_element_by_css_selector("#s_tid_list > li:nth-child(%d) > a:nth-child(1)" % s_tid)
        s_tid_name = choose.text
        choose.click()
        time.sleep(5)
        # the pager text contains counts like "...条" (rows) and "...页" (pages)
        page_info = browser.find_element_by_css_selector("#projectPages_wrap").text
        total_rows = re.findall(r"(\d+)条", page_info)
        page_num = re.findall(r"(\d+)页", page_info)
        base_info["s_status_name"] = [s_status_name]
        base_info["s_status"] = [s_status]
        base_info["s_tid_name"] = [s_tid_name]
        base_info["s_tid"] = [70 + s_tid - 1]  # menu positions 2..6 map to site category ids 71..75
        base_info["row"] = total_rows
        base_info["p"] = page_num
        meta_info = pd.concat((meta_info, base_info))
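# Step 2: expand meta_info into one URL per result page; the list page is
# addressed by the hash parameters s_status, s_tid and p.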
url_pd = pd.DataFrame()
base_pd = pd.DataFrame()
for index, row in meta_info.iterrows():
    for col_name in meta_info.columns:
        if col_name != "p":
            base_pd[col_name] = [row[col_name]]
        else:
            for p in range(1, int(row[col_name]) + 1):
                base_pd["p"] = p
                base_pd["url"] = root + "#s_status=%d&s_tid=%d&p=%d" % (row["s_status"], row["s_tid"], p)
                url_pd = pd.concat((base_pd, url_pd))
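# Step 3: load every page URL in the browser and snapshot the raw HTML of the
# project list for offline parsing.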
page_pd = pd.DataFrame()
base_pd = pd.DataFrame()
i = 0
for index, row in url_pd.iterrows():
    i = i + 1
    print(i)  # progress counter
    for col_name in url_pd.columns:
        base_pd[col_name] = [row[col_name]]
        if col_name == "url":
            browser.get(row[col_name])
            time.sleep(1)  # give the page's JS a moment to render
            base_pd["item"] = pq(browser.page_source)("#projectList_wrap").html()
    page_pd = pd.concat((page_pd, base_pd))
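# Step 4: parse each project card (.pro_li) out of the saved HTML, slicing the
# card text between the fixed Chinese field labels, then write everything to CSV.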
item_pd = pd.DataFrame()
base_pd = pd.DataFrame()
const = [""]
i = 0
for index, row in page_pd.iterrows():
    i = i + 1
    print(i)  # progress counter
    for col_name in page_pd.columns:
        if col_name != "item":
            base_pd[col_name] = [row[col_name]]
        else:
            for item in pq(row[col_name])(".pro_li"):
                text = pq(item).text().replace("|", "").replace("\xa0", "")
                flat = text.replace("\n", "")
                base_pd["公益标题"] = text.split("\n")[0]
                base_pd["公益链接"] = "http://gongyi.qq.com/succor/" + pq(item)(".titless").attr("href")
                # re.findall returns [] when a label pair is missing, and
                # assigning an empty list to a DataFrame column raises, so
                # each field falls back to the empty string in its except branch.
                try:
                    base_pd["公益简介"] = re.findall(r"项目简介(.*?)筹款目标", flat)
                except Exception:
                    base_pd["公益简介"] = const
                try:
                    base_pd["筹款目标"] = re.findall(r"筹款目标(.*?)筹款时间", flat)
                except Exception:
                    base_pd["筹款目标"] = const
                try:
                    base_pd["筹款时间"] = re.findall(r"筹款时间(.*?)执 行 方", flat)
                except Exception:
                    base_pd["筹款时间"] = const
                try:
                    base_pd["执行方"] = re.findall(r"执 行 方(.*?)项目状态", flat)
                except Exception:
                    base_pd["执行方"] = const
                try:
                    base_pd["项目状态"] = re.findall(r"项目状态(.*?)已筹", flat)
                except Exception:
                    base_pd["项目状态"] = const
                try:
                    base_pd["筹款情况"] = re.findall(r"已筹:(.*?)人次捐款", flat)
                except Exception:
                    base_pd["筹款情况"] = const
                try:
                    base_pd["筹款进度"] = re.findall(r"人次捐款(.*?)我要捐款", flat)
                except Exception:
                    base_pd["筹款进度"] = const
                item_pd = pd.concat((item_pd, base_pd))
item_pd.to_csv("E:\\公益数据.csv")
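One caveat before rerunning this script today: the find_element_by_css_selector helpers used above were removed in Selenium 4, and the fixed time.sleep() waits are fragile. A minimal sketch of the modern equivalent, assuming Selenium 4+ and reusing one selector from the script:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(browser, 10)  # poll up to 10 s instead of sleeping blindly
choose = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#s_status_text")))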
Data results
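The crawl ends in E:\公益数据.csv. As a quick sanity check on the result file, here is a minimal loading-and-cleanup sketch; the column names come from the script above, but the amount format (strings like "1,234,567元") is an assumption about what gets scraped:

import pandas as pd

df = pd.read_csv("E:\\公益数据.csv", index_col=0)
# Assumed format: amounts scraped as strings such as "1,234,567元".
# Keep digits and dots only, then convert to numbers.
for col in ["筹款目标", "筹款情况"]:
    df[col] = pd.to_numeric(
        df[col].astype(str).str.replace(r"[^\d.]", "", regex=True),
        errors="coerce",
    )
# e.g. total amount raised per project status
print(df.groupby("项目状态")["筹款情况"].sum())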