Yesterday, on a whim, I crawled all sorts of articles from Jianshu. I don't even know how many I got in the end. Considering my potato of a computer, I didn't crawl the article bodies themselves, only each article's title, author, URL, and opening sentence.
First, a quick look at the Jianshu site shows it isn't a hard target, which is good for building my confidence. Without further ado, here is the data as saved to the database.
The first box is the list of collections; the remaining ones are the content I need to crawl.
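For reference, each record the second script saves is a small document with four fields. A made-up example (field names come from the code below; the values are invented for illustration):

{
    "title": "an example article title",
    "author": "some author",
    "charget": "the opening sentence of the article...",
    "charget_url": "http://www.jianshu.com/p/xxxxxxxxxxxx"
}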
import re
import requests
from bs4 import BeautifulSoup

# This file is saved as 简书专题.py; the second script imports base_url from it.
for n in range(5):
    # Walk the first five pages of the recommended-collections listing
    url = 'http://www.jianshu.com/recommendations/collections?page=%s&order_by=recommend' % n
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
    html = requests.get(url, headers=headers)  # headers must be passed as a keyword argument
    soup = BeautifulSoup(html.text, 'lxml')
    # Each collection card is wrapped in an element with class "collection-wrap"
    hrefs = soup.find_all(class_='collection-wrap')
    for href in hrefs:
        pattern = re.compile('<a href="(.*?)" target="_blank">.*?</a>', re.S)
        result = re.search(pattern, str(href)).group(1)
        # print(result)
        base_url = 'http://www.jianshu.com' + result
        # print(base_url)
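Incidentally, the regex is not strictly necessary here: BeautifulSoup can read the href attribute off the tag directly. A minimal sketch under the same page structure:

for wrap in soup.find_all(class_='collection-wrap'):
    link = wrap.find('a')                      # first <a> inside the collection card
    if link is not None and link.get('href'):
        base_url = 'http://www.jianshu.com' + link['href']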
import pymongo
import re
import requests
import time
from bs4 import BeautifulSoup
from 简书专题 import base_url  # importing runs 简书专题.py; base_url is the last link it found

# 1. Open the database connection; MongoDB listens on port 27017 by default
conn = pymongo.MongoClient(host='localhost', port=27017)
# 2. Select (or create) the database
jianshu = conn['jianshu']
# 3. Select (or create) the collection
dashuseng = jianshu['jianshuDA']
def get_list(ID):
    url = base_url + "?order_by=added_at&page=%s" % ID
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
    response = requests.get(url, headers=headers)  # again, headers goes in as a keyword argument
    time.sleep(1)  # be polite: one request per second
    soup = BeautifulSoup(response.text, 'lxml')
    note_list = soup.find_all('div', class_='content')
    if not note_list:
        return False  # empty page: we have run past the last page of this collection
    for note in note_list:
        title = note.find(class_='title').text
        author = note.find(class_='blue-link').text
        charget = note.find(class_='abstract').text  # the opening snippet of the article
        pattern = re.compile('<a class="title" href="(.*?)" target="_blank">.*?</a>', re.S)
        charget_url = 'http://www.jianshu.com' + re.search(pattern, str(note)).group(1)
        # wenzs = BeautifulSoup(requests.get(charget_url, headers=headers).text, 'lxml')
        # wenz = wenzs.find_all(attrs={'show-content', 'p'})
        # pattern2 = re.compile('<p>(.*?)</p>')
        # dd = re.search(pattern2, str(wenz))
        # print(charget_url)
        # print(dd)
        dic = {
            "title": title,
            "author": author,
            "charget": charget,
            "charget_url": charget_url,
            # "dd": dd
        }
        dashuseng.insert_one(dic)
        # print(title, author, charget, charget_url)
# def save_to_mongodb(self):
for ID in range(1000):
    # Call get_list once per page; it returns False once a page comes back empty
    if get_list(ID) is False:
        break
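Two quick follow-ups on the storage side. First, once data starts landing, it is easy to sanity-check it straight from pymongo (count_documents needs pymongo 3.7+; older versions used count):

print(dashuseng.count_documents({}))     # total number of saved articles
for doc in dashuseng.find().limit(3):    # peek at the first few documents
    print(doc['title'], doc['charget_url'])

Second, re-running the script makes insert_one write duplicates. A sketch of one way around that, treating the article URL as a natural key (this would replace the insert_one call inside get_list):

dashuseng.update_one(
    {"charget_url": charget_url},   # match on the article URL
    {"$setOnInsert": dic},          # only write the document if it is new
    upsert=True,
)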
I just added some proxies and such, but it still didn't help much. I never figured out the anti-crawling here, and the final dataset came to only 3,000 records. A pity.
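For the record, this is roughly how proxies plug into requests; the address below is a made-up placeholder, not a working proxy:

proxies = {
    "http": "http://123.45.67.89:8080",    # placeholder, substitute a live proxy
    "https": "http://123.45.67.89:8080",
}
response = requests.get(url, headers=headers, proxies=proxies, timeout=10)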