JAVA爬取虎嗅网截图_python爬取虎嗅网数据

#!/usr/bin/env python

# -*- coding:utf-8 -*-

import requests

import pymongo

from bs4 import BeautifulSoup

# MongoDB handle: the "huxiu" collection in the "spiders" database on the
# local server.
client = pymongo.MongoClient(host="localhost", port=27017)
collection = client["spiders"]["huxiu"]

# AJAX endpoint that the crawler POSTs to for each page of results.
url = "https://www.huxiu.com/channel/ajaxGetMore"

# Headers sent with every request to the endpoint above.
headers = {
    "Referer": "https://www.huxiu.com/channel/104.html",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
    ),
}

def get_total_page():
    """Return the total page count reported by the channel endpoint.

    Posts a request for page 1 of category 104 to the module-level ``url``
    and reads ``data.total_page`` from the JSON response.

    Returns:
        The value of ``total_page`` exactly as it appears in the response.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON lacks ``data`` or ``total_page``.
    """
    payload = {
        "huxiu_hash_code": "9501c2ced764ebbe029807a9f17790fa",
        "page": 1,
        "catId": 104,
    }
    # Without a timeout a stalled connection would block the crawler forever.
    response = requests.post(url, data=payload, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()["data"]["total_page"]

def main(page):
    """Fetch one page of the channel listing and return its payload.

    Args:
        page: Page number to request from the endpoint.

    Returns:
        The value stored under ``data.data`` in the JSON response.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON lacks the ``data`` keys.
    """
    payload = {
        "huxiu_hash_code": "9501c2ced764ebbe029807a9f17790fa",
        "page": page,
        "catId": 104,
    }
    # Without a timeout a stalled connection would block the crawler forever.
    response = requests.post(url, data=payload, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()["data"]["data"]

def parse_data(data):
    """Extract article records from HTML markup and store them in MongoDB.

    Each ``<div class="mod-art">`` element becomes one document inserted into
    the module-level ``collection``; ``success`` is printed after each insert.

    Args:
        data: HTML markup to parse with BeautifulSoup's ``lxml`` parser.
    """
    soup = BeautifulSoup(data, "lxml")
    for item in soup.find_all("div", attrs={"class": "mod-art"}):
        article = {}
        article["article_aid"] = item["data-aid"]

        link = item.find("a", attrs={"class": "transition"})
        article["article_title"] = link["title"]
        # NOTE: the key spelling "article_ulr" is kept as-is because it is
        # the field name already used for stored documents.
        article["article_ulr"] = link["href"]

        # Prefer the "data-original" attribute and fall back to "src".
        # An article link without any <img> gets None instead of crashing.
        img_node = link.find("img")
        if img_node is None:
            article["article_img"] = None
        else:
            article["article_img"] = (
                img_node.get("data-original") or img_node.get("src")
            )

        author_face_node = item.find("div", attrs={"class": "author-face"})
        article["member_url"] = author_face_node.find("a")["href"]
        article["author_face"] = author_face_node.find("img")["src"]
        article["author_name"] = item.find(
            "span", attrs={"class": "author-name"}
        ).string

        # Persist the record. insert_one replaces Collection.insert, which
        # is deprecated and no longer exists in PyMongo 4.
        collection.insert_one(article)
        print("success")

if __name__ == "__main__":
    # Ask the endpoint how many pages exist, then fetch, parse and store
    # each page in order (page numbers start at 1).
    pages = get_total_page()
    for page in range(1, pages + 1):
        print("正在爬去第{}页".format(page))
        data = main(page)
        parse_data(data)

2018101104281257.png

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值