# 爬取文章写入doc文件中 — scrape articles and write them into .docx files

#-*-coding:utf-8-*-
import requests

import os
import time

from docx.shared import Inches
import docx
from bs4 import BeautifulSoup

# Browser-like User-Agent so jianshu.com serves the normal HTML pages
# instead of rejecting the scraper as a bot.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0"
}


def get_html(url):
    """Fetch *url* with the module-level browser headers and return the body text.

    A timeout is set so a dead connection fails fast instead of
    hanging the whole crawl forever.
    """
    response = requests.get(url, headers=headers, timeout=10)
    return response.text

# Site root, used to turn the relative article hrefs into absolute URLs.
baseUrl = "https://www.jianshu.com"
# Collection (专题) page whose article list will be crawled page by page.
url = "https://www.jianshu.com/c/1111111"

def download_img(href):
    """Download the image at protocol-relative *href* into ./pic/ and return its path.

    The last URL path component is used as the local filename; an
    already-downloaded file is reused instead of re-fetching it.
    """
    url = "https:" + href
    # Last path component of the URL, e.g. "abc123.png".
    name = url.split("/")[-1]
    path = os.path.join("./pic", name)
    # Cache hit: skip the network round-trip entirely.
    if os.path.exists(path):
        return path
    # The original crashed when ./pic/ was missing; create it on demand.
    if not os.path.isdir("./pic"):
        os.makedirs("./pic")
    rep = requests.get(url, headers=headers, timeout=10)
    # 'with' guarantees the handle is closed even if write() raises.
    with open(path, "wb") as f:
        f.write(rep.content)
    return path

text = get_html(url)

html = BeautifulSoup(text, 'html.parser')

#articleNums = html.find_all("div", {"class": "meta-block"})[2].find("p").text
#articleNums = int(articleNums)
#pages = articleNums//9
#if pages * 9 < articleNums:
#    pages += 1

pages = 100
for pageNum in range(1, pages + 1):
    articleListUrl = url + '?order_by=shared_at&page=' + str(pageNum)
    req = requests.get(articleListUrl, headers=headers)
    html = BeautifulSoup(req.text, 'html.parser')
    ul = html.find("ul", {"class":"note-list"})
    lis = ul.find_all("li")
    for li in lis:
        titleBlock = li.find("a", {"class": "title"})
        title = titleBlock.text
        href = baseUrl + titleBlock['href']

	print href
	req1 = requests.get(href, headers = headers)
    	html1 = BeautifulSoup(req1.text, 'html.parser')
    	ul1 = html1.find("article", {"class":"_2rhmJa"})

	if "/" in title:
		title = title.replace("/", "_")
	if "\\" in title:
		title = title.replace("\\", "_")
	_file = "./all/" + title
	if os.path.exists(_file+".docx"):
	    _file = _file + "-" + str(int(time.time()))
	print _file
	doc = docx.Document()
	'''
	fp = open(_file, "wb+")
	s = ul1.find_all("p")
	for one in s:
		line = str(one)[3:-4]
		print line
		if len(line) == 0:
			fp.write("\n")
			continue
		if "<br/>" in line:
			line = line.replace("<br/>", "\n")
		fp.write(line)
		fp.write("\n")

		doc.add_paragraph(line.decode('utf-8'))
	fp.close()
	'''
	s = str(ul1).split("</p>")
	for one in s:
		line = str(one)
		pic = False
		imgUrl = ""
		if "data-original-src" in line:
			tmp = line.split("data-original-src=\"")[1]
			for a in tmp:
				if a == "\"":
					break
				imgUrl += a
			if len(imgUrl) > 0:
				pic = True
		if pic:
			pic = False
			_path = download_img(imgUrl)
			try:
			    doc.add_picture(_path, width=Inches(6.0))
			except docx.image.exceptions.UnrecognizedImageError, e:
			    print "-------------->",_path
			    pass

		if "<p>" in line:
			line = line.split("<p>")[1]
		if "<br/>" == line:
			line = "\n"
		if "article" in line:
			continue
		if len(line) > 5 and line[-5:] == "<br/>":
			line = line[:-5] + "\n"
		if "<b>" == line[:3]:
			line = line[3:]
		if "</b>"  == line[-4:]:
			line = line[:-4]

		doc.add_paragraph(line.decode('utf-8'))
	doc.save(_file + ".docx")



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值