#-*-coding:utf-8-*-
import requests
import os
import time
from docx.shared import Inches
import docx
from bs4 import BeautifulSoup
# HTTP headers passed to every requests.get() call in this file.  The
# User-Agent mimics desktop Firefox (presumably so the site serves the normal
# page instead of rejecting the default python-requests agent -- verify).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) "
        "Gecko/20100101 Firefox/72.0"
    ),
}
def get_html(url):
    """Fetch ``url`` with the module-level ``headers`` and return the body text.

    The value returned is the ``text`` attribute of the ``requests`` response,
    i.e. the body decoded to a string by ``requests``.
    """
    response = requests.get(url, headers=headers)
    body = response.text
    return body
# Site root; relative article links taken from the listing are appended to it.
baseUrl = "https://www.jianshu.com"
# Collection whose article listing is crawled; paging query parameters are
# appended to this address by the main loop.
url = baseUrl + "/c/1111111"
def download_img(href):
    """Download one image into ``./pic/`` and return its local path.

    ``href`` is normally the protocol-relative value of a
    ``data-original-src`` attribute (``//host/path/name.ext``), in which case
    ``https:`` is prepended.  An already absolute ``http://`` / ``https://``
    URL is used unchanged.

    The local file name is the last ``/``-separated component of the URL.  If
    a file with that name already exists in ``./pic/`` it is reused and no
    request is made.

    Returns the path ``./pic/<name>``.
    """
    # Only add the scheme when it is missing; "https:" + "https://..." would
    # produce an unusable URL.
    if href.startswith(("http://", "https://")):
        url = href
    else:
        url = "https:" + href
    name = url.split("/")[-1]
    path = "./pic/" + name
    if os.path.exists(path):
        # Already downloaded by an earlier article / earlier run.
        return path
    rep = requests.get(url, headers=headers)
    # open() does not create missing directories.
    if not os.path.isdir("./pic"):
        os.makedirs("./pic")
    # "with" guarantees the handle is closed even if the write fails.
    with open(path, "wb") as f:
        f.write(rep.content)
    return path
# ---------------------------------------------------------------------------
# Script body: walk the listing pages of the collection at ``url`` and save
# every listed article as ./all/<title>.docx (text paragraphs plus images).
# Written so that it runs unchanged on Python 2 and Python 3.
# ---------------------------------------------------------------------------

# Landing page of the collection.  Its parsed form is only consumed by the
# disabled page-count computation below; the loop re-fetches every listing
# page itself.
text = get_html(url)
html = BeautifulSoup(text, 'html.parser')
# Disabled: derive the number of listing pages from the article counter on
# the landing page (9 articles per page, rounded up).
#articleNums = html.find_all("div", {"class": "meta-block"})[2].find("p").text
#articleNums = int(articleNums)
#pages = articleNums//9
#if pages * 9 < articleNums:
#    pages += 1
# Hard-coded upper bound on the number of listing pages to visit.
pages = 100

# doc.save() does not create missing directories.
if not os.path.isdir("./all"):
    os.makedirs("./all")

for pageNum in range(1, pages + 1):
    articleListUrl = url + '?order_by=shared_at&page=' + str(pageNum)
    req = requests.get(articleListUrl, headers=headers)
    html = BeautifulSoup(req.text, 'html.parser')
    ul = html.find("ul", {"class": "note-list"})
    if ul is None:
        # ``pages`` is only an upper bound: a response without the article
        # list means there is nothing further to crawl.
        break
    lis = ul.find_all("li")
    for li in lis:
        titleBlock = li.find("a", {"class": "title"})
        if titleBlock is None:
            # List item without a title link; nothing to follow.
            continue
        title = titleBlock.text
        href = baseUrl + titleBlock['href']
        print(href)
        req1 = requests.get(href, headers=headers)
        html1 = BeautifulSoup(req1.text, 'html.parser')
        # NOTE(review): "_2rhmJa" looks like a build-generated class name;
        # if every article starts being skipped, check whether it changed.
        ul1 = html1.find("article", {"class": "_2rhmJa"})
        if ul1 is None:
            # Without this guard str(None) would be written as the document.
            print("article body not found, skipped: " + href)
            continue

        # Path separators in the title would be read as directories.
        title = title.replace("/", "_").replace("\\", "_")
        _file = "./all/" + title
        if os.path.exists(_file + ".docx"):
            # Same title seen before: keep both by appending a timestamp.
            _file = _file + "-" + str(int(time.time()))
        print(_file)
        doc = docx.Document()

        # The article markup is processed as raw text, one "</p>"-terminated
        # fragment at a time.
        for one in str(ul1).split("</p>"):
            line = one

            # Image: take the attribute value up to its closing quote.
            # partition() yields "" when the attribute is malformed instead
            # of raising IndexError.
            imgUrl = ""
            if "data-original-src" in line:
                imgUrl = line.partition("data-original-src=\"")[2].partition("\"")[0]
            if imgUrl:
                _path = download_img(imgUrl)
                try:
                    doc.add_picture(_path, width=Inches(6.0))
                except docx.image.exceptions.UnrecognizedImageError:
                    # Best effort: report the unusable file and carry on.
                    print("--------------> " + _path)

            # Keep only what follows the opening <p> of the fragment.
            if "<p>" in line:
                line = line.split("<p>")[1]
            if line == "<br/>":
                line = "\n"
            # Skip the wrapper fragments that carry the <article> tags.  Only
            # the tags are matched so that paragraphs merely mentioning the
            # word "article" are kept.
            if "<article" in line or "</article>" in line:
                continue
            if len(line) > 5 and line.endswith("<br/>"):
                line = line[:-5] + "\n"
            # Strip one enclosing bold tag pair.
            if line.startswith("<b>"):
                line = line[3:]
            if line.endswith("</b>"):
                line = line[:-4]
            # On Python 2 str(tag) is a UTF-8 byte string and must be decoded
            # before python-docx receives it; on Python 3 it is already text.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            doc.add_paragraph(line)
        doc.save(_file + ".docx")
# Crawl articles and write them into doc (.docx) files.
# (Web-page residue: latest recommended article published on 2023-11-08 18:33:51.)