#1.数据例子演示
目标 script 标签的序号要么从 0 开始数,要么从 1 开始数,请以实际页面源码为准(下面代码中使用的下标是 [6],Python 下标从 0 开始)。
示例页面源码:view-source:http://www.hdb.com/party/a0lz2.html ,目标数据位于 head 中的第六个 script 标签。
#2.开始撸代码(python3.6)
只是部分主要代码
import js2xml
import requests
import scrapy
from bs4 import BeautifulSoup
from lxml import etree
class HdbSpider(scrapy.Spider):
    """Spider that reads data embedded in an inline ``<script>`` on hdb.com.

    ``parse`` picks one script from the page ``<head>``, converts its
    JavaScript source to XML with ``js2xml`` and queries the result with
    XPath.
    """

    name = 'hdb'
    # Scrapy expects bare domain names here, not URLs.
    allowed_domains = ['www.hdb.com']
    start_urls = ['http://www.hdb.com/']
    # Nationwide listing page.
    globalUrl = ['http://www.hdb.com/quanguo/']
    # Zero-based position of the target script among the ``head script``
    # matches; override it if the page layout changes.
    script_index = 6

    def url(self):
        """Yield the request for the sample activity page.

        NOTE(review): Scrapy does not call a method named ``url`` by itself;
        presumably it is invoked explicitly or was meant to be the spider's
        start hook -- confirm against the full project.
        """
        url = 'http://www.hdb.com/party/a0lz2.html'
        # dont_filter=True lets the request bypass Scrapy's duplicate filter.
        yield scrapy.Request(url, self.parse, dont_filter=True)

    def parse(self, response):
        """Print the XML form of the target script and its ``_id`` value.

        Logs a warning and returns without printing when the expected script
        or the ``_id`` property cannot be found.
        """
        # Main content.
        soup = BeautifulSoup(response.text, 'lxml')
        scripts = soup.select('head script')
        if len(scripts) <= self.script_index:
            self.logger.warning(
                'Expected more than %d <head> scripts on %s, found %d',
                self.script_index, response.url, len(scripts))
            return

        # ``.string`` is None when the tag has no single text child, e.g. a
        # script that only references an external file.
        src = scripts[self.script_index].string
        if not src:
            self.logger.warning(
                'Script #%d on %s has no inline source',
                self.script_index, response.url)
            return

        src_text = js2xml.parse(src, debug=False)
        src_tree = js2xml.pretty_print(src_text)
        print('treeeeeeeeeeeeeeeeeeeeeeeeeeeee')
        print(src_tree)

        # Re-parse the pretty-printed XML so it can be queried with XPath.
        selector = etree.HTML(src_tree)
        # Adjust the XPath to match whichever data you need.
        matches = selector.xpath("//property[@name = '_id']/string/text()")
        if not matches:
            self.logger.warning(
                "No '_id' property found in script #%d on %s",
                self.script_index, response.url)
            return
        content = matches[0]
        print(content)
图一
详细代码地址
git@github.com:yzw1/python-Reptilian-content.git
参考文章
1. https://blog.csdn.net/fan3652/article/details/72780301 (去除里面的内容)
2. https://blog.csdn.net/qq_34246164/article/details/80700399
3. https://blog.csdn.net/freeking101/article/details/64461574