#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import pymongo
from bs4 import BeautifulSoup
# MongoDB handle: scraped documents are written to the "huxiu" collection
# of the "spiders" database on the local server (default port 27017).
client = pymongo.MongoClient("localhost", 27017)
collection = client.spiders.huxiu

# AJAX endpoint that is POSTed to for each page of a channel listing.
url = "https://www.huxiu.com/channel/ajaxGetMore"

# Headers sent with every request: the Referer is the channel 104 page and
# the User-Agent is a desktop Chrome string.
# NOTE(review): presumably required so the endpoint accepts the call as a
# browser request -- not verified here.
headers = {
    "Referer": "https://www.huxiu.com/channel/104.html",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
    ),
}
def get_total_page(timeout=10):
    """Return the number of listing pages reported by the AJAX endpoint.

    Posts a request for page 1 of category 104 to the module-level ``url``
    and reads ``data.total_page`` from the JSON reply.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``total_page`` value from the response, passed through unchanged.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON lacks ``data`` or ``total_page``.
    """
    payload = {
        # NOTE(review): hard-coded value copied from the site; presumably a
        # token that may change over time -- confirm.
        "huxiu_hash_code": "9501c2ced764ebbe029807a9f17790fa",
        "page": 1,
        "catId": 104,
    }
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole crawl.
    r = requests.post(url, data=payload, headers=headers, timeout=timeout)
    # Surface HTTP errors directly instead of trying to decode an error page
    # as JSON.
    r.raise_for_status()
    return r.json()["data"]["total_page"]
def main(page, timeout=10):
    """Fetch one listing page of category 104 and return its HTML payload.

    Posts to the module-level ``url`` and returns the ``data.data`` field of
    the JSON reply, which ``parse_data`` feeds to an HTML parser.

    Args:
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``data.data`` value from the response, passed through unchanged.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON lacks the nested ``data`` keys.
    """
    payload = {
        # NOTE(review): hard-coded value copied from the site; presumably a
        # token that may change over time -- confirm.
        "huxiu_hash_code": "9501c2ced764ebbe029807a9f17790fa",
        "page": page,
        "catId": 104,
    }
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole crawl.
    r = requests.post(url, data=payload, headers=headers, timeout=timeout)
    # Surface HTTP errors directly instead of trying to decode an error page
    # as JSON.
    r.raise_for_status()
    return r.json()["data"]["data"]
def parse_data(data):
    """Extract article records from an HTML fragment and store them in MongoDB.

    Every ``<div class="mod-art">`` found in ``data`` becomes one document
    holding the article id, title, link and cover image plus the author's
    profile link, avatar and name. Each document is written to the
    module-level ``collection`` and "success" is printed after the write.

    Args:
        data: HTML markup to parse. A falsy value (``None`` or an empty
            string) is treated as "no articles" and nothing is written.

    Raises:
        KeyError, TypeError, AttributeError: If an article block lacks the
            expected id attribute, title link, or author markup.
    """
    if not data:
        # Nothing to parse; avoid handing None to the HTML parser.
        return

    bs = BeautifulSoup(data, "lxml")
    for item in bs.find_all("div", attrs={"class": "mod-art"}):
        article = {}
        article["article_aid"] = item["data-aid"]

        a_node = item.find("a", attrs={"class": "transition"})
        article["article_title"] = a_node["title"]
        # NOTE: the key is spelled "article_ulr" (sic). It is kept as-is so
        # new documents match the ones already stored under that name.
        article["article_ulr"] = a_node["href"]

        # Prefer the "data-original" attribute and fall back to "src" when it
        # is absent or empty. An article without any <img> gets None instead
        # of aborting the whole page.
        img_node = a_node.find("img")
        if img_node is None:
            article["article_img"] = None
        else:
            article["article_img"] = (
                img_node.get("data-original") or img_node.get("src")
            )

        author_face_node = item.find("div", attrs={"class": "author-face"})
        article["member_url"] = author_face_node.find("a")["href"]
        article["author_face"] = author_face_node.find("img")["src"]
        article["author_name"] = item.find(
            "span", attrs={"class": "author-name"}
        ).string

        # Store the record. Collection.insert() is deprecated since
        # PyMongo 3.0 and removed in 4.0; insert_one() is its replacement
        # for a single document.
        collection.insert_one(article)
        print("success")
if __name__ == "__main__":
    # Ask the endpoint how many pages exist, then fetch, parse and store
    # each page in order, starting from page 1.
    total_pages = get_total_page()
    for page_number in range(1, total_pages + 1):
        print("正在爬去第{}页".format(page_number))
        parse_data(main(page_number))