Python爬虫开发-08--遇到了好多困难头都大了终于搞好了

本文链接：https://blog.csdn.net/Sarline/article/details/80296126

# coding:utf-8
from lxml import etree
import io
import sys 
import time
reload(sys) 
sys.setdefaultencoding('utf-8')
from bs4 import BeautifulSoup
# from lxml import etree
import requests
import re
# Scrape the chapter index of "Daomu Biji" and save one chapter title per
# line. The output file is opened in binary mode and every title is encoded
# to UTF-8 explicitly, so Chinese text is stored correctly without depending
# on the interpreter's default string encoding.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
headers = {
    # Send the actual browser UA string, not the literal text 'user_agent'.
    'User-Agent': user_agent,
}
# Fetch the page BEFORE opening the output file, so that a failed request
# does not truncate a previously saved result.
r = requests.get('http://www.jueshitangmen.info/daomubiji', headers=headers)
# Page layout: each volume sits in <div class="bg"> with its title in an
# <h2>; the chapters are the <a> links inside that volume's
# <div class="panel">.
soup = BeautifulSoup(r.content, 'html.parser', from_encoding='utf-8')

with io.open('F:/Python/daomubiji.doc', 'wb') as f:
    for bg in soup.find_all(class_='bg'):
        # Blocks without an <h2> heading are not volumes; skip them.
        h2 = bg.find('h2')
        if h2 is None:
            continue
        panel = bg.find(class_='panel')
        if panel is None:
            # A volume without a chapter panel: report it instead of
            # crashing on None.find_all(...).
            print('failed')
            continue
        for a in panel.find_all('a'):
            # Tag.string is None unless the link has exactly one text child
            # (directly or through a single nested tag); fall back to the
            # concatenated text so the word "None" is never written.
            a_title = a.string
            if a_title is None:
                a_title = a.get_text()
            # Encode explicitly: the file is binary, titles are unicode.
            f.write(a_title.encode('utf-8') + b'\n')