# Question 1
import re
strs = """<html>
<head>
<meta charset="UTF-8">
<title>学号尾号为9</title>
</head>
<body>
<h1>呼和浩特:蔬菜价格同比去年上涨37.71%</h1>
<p>近日,全国蔬菜价格普遍上涨,呼和浩特市民的“菜篮子”怎么样?</p>
<p>11月23~24日,记者走访了美通、东瓦窑、百家润、笙鲜农场、北京华联等各大批发市场、超市看到,受季节、天气等多重因素影响,一些生活必需品略有上涨,但供应充足,品种丰富。</p>
<p>蔬菜价格同比去年上涨37.71%</p>
<p>在东瓦窑农副产品批发市场,各类蔬菜琳琅满目,黄瓜5.0元/斤,西红柿4.5元/斤,本地豆角7.5元/斤,大白菜1.5元/斤,白萝卜2.0元/斤,土豆1.5元/斤。</p>
<p>商贩们说,今年菜价相比去年略高一些,其中叶类菜、豆角、辣椒、黄瓜等价格涨幅较高。</p>
<p>在百家润等生鲜超市,记者了解到,部分叶类菜的价格高于往年,其中,菠菜、油麦菜6.98元/斤,油菜4.20元/斤,芹菜4元/斤。</p>
</body>
</html>"""
model1 = '<h1>(.*?)</h1>'
model2 = '<p>(.*?)</p>'
result1 = re.findall(model1, strs, re.S)
result2 = re.findall(model2, strs, re.S)
print(result1)
print(result2)
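# Minimal sketch (illustration only): with the tags stripped by the regexes,
# the headline and paragraphs can be reassembled into plain text.
article_text = result1[0] + '\n' + '\n'.join(result2)
print(article_text)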
# Question 2
import requests
import re
import csv
# Fetch the page source
url = r'https://tieba.baidu.com/p/7544967041'
html = requests.get(url).content.decode()
print(html)
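# Hedged sketch: some sites reject requests' default User-Agent; if the page
# comes back empty or blocked, a browser-like header usually helps:
# html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).content.decode()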
# Extract the usernames
username_list = re.findall('username="(.*?)".*?class=', html, re.S)
print(username_list)
# Extract the post contents
content_list = re.findall('class="d_post_content.*?j_d_post_content.*?style="display:;">(.*?)</div>',html,re.S)
print(content_list)
# Extract the post times
reply_time_list = re.findall('date":"(20.*?)"',html,re.S)
print(reply_time_list)
data_list = []
for i in range(len(username_list)):
    result = {'username': username_list[i],
              'content': content_list[i],
              'reply_time': reply_time_list[i]
              }
    data_list.append(result)
with open(r'D:\Temp\tieba123456.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['username', 'content', 'reply_time'])
    writer.writeheader()
    writer.writerows(data_list)
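# Hedged alternative sketch: the index-based loop above assumes the three
# lists are equally long; zip() stops at the shortest, which avoids an
# IndexError when the regexes match different numbers of posts.
data_list_alt = [
    {'username': u, 'content': c, 'reply_time': t}
    for u, c, t in zip(username_list, content_list, reply_time_list)
]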
# Question 3
import lxml.html
import requests
url = r'https://www.kanunu8.com/book2/11135/index.html'
response = requests.get(url)
html_txt = response.content.decode(encoding="gbk")
selector = lxml.html.fromstring(html_txt)
info1 = selector.xpath('//div[@class="main"]')
print(info1)
# Use relative paths ('./') so each step searches inside the previous node;
# a bare '//' in lxml searches the whole document again.
info2 = info1[0].xpath('./div[@class="col-left"]')
print(info2)
info3 = info2[0].xpath('./div[@class="book"]')
print(info3)
info4 = info3[0].xpath('./dl')
print(info4)
info5 = info4[0].xpath('.//dd')
print(info5)
info6 = []
for element in info5:
    # result = element.xpath('./a/@href')
    # or, equivalently:
    result = element.xpath('a/@href')
    if result:
        info6.append(result[0])
# xpath() returns a list; print each extracted href
for u in info6:
    print(u)
info7 = []
# The chapter hrefs are relative to the index page's directory, so the base
# must match the index URL above (book2/11135, not another book's id).
for element in info6:
    info7.append('https://www.kanunu8.com/book2/11135/' + element)
print(info7)
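# Sketch (assumes the chapter hrefs are relative to the index directory):
# urllib.parse.urljoin derives absolute URLs from the index URL itself, so
# the base can never drift out of sync with `url` above.
from urllib.parse import urljoin
info7_alt = [urljoin(url, href) for href in info6]
print(info7_alt)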
# Question 3, version 2
import requests
import lxml.html
import re
import os
url = r'https://www.kanunu8.com/book2/11137/index.html'
response = requests.get(url)
html_code = response.content.decode(encoding="gbk")
selector = lxml.html.fromstring(html_code)
# One-shot XPath that collects every chapter href in a single expression
info = selector.xpath('//div[@class="main"]/div[@class="col-left"]/div[@class="book"]/dl/dd/a/@href')
# The same extraction, step by step:
info1 = selector.xpath('//div[@class="main"]')
print(info1)
info2 = info1[0].xpath('./div[@class="col-left"]')
print(info2)
info3 = info2[0].xpath('./div[@class="book"]')
print(info3)
info4 = info3[0].xpath('./dl')
print(info4)
info5 = info4[0].xpath('.//dd')
print(info5)
info6 = []
for temp in info5:
    result = temp.xpath('./a/@href')
    if result:
        info6.append(result[0])
print(info6)
info7 = []
for element in info:
    info7.append('https://www.kanunu8.com/book2/11137/' + element)
for u in info7:
    print(u)
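# Quick check (sketch): the one-shot XPath result `info` should equal the
# hrefs collected step by step in `info6`.
print(info == info6)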
# Question 3, database version
import csv
import requests
import re
import os.path
import pymongo
from lxml import html
# Start MongoDB first (on Windows: press Win+R, then run `mongo` for the shell)
# Assumed connection details; the database/collection names below reuse the
# ones from the later Top-100 section and are only placeholders.
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['fiction']
collection = db['fiction_lab']
# First, fetch the page source
url = r"<URL>"
response = requests.get(url).content.decode()
# Parse the page
selector = html.fromstring(response)
# Extract the nodes from the outermost level inward
info1 = selector.xpath('//div[@class="main"]')
info2 = info1[0].xpath('./div[@class="col-left"]')
info3 = info2[0].xpath('./div[@class="book"]')
info4 = info3[0].xpath('./dl')
info5 = info4[0].xpath('.//dd')
info6 = []
for temp in info5:
    result = temp.xpath('./a/@href')
    if result:
        info6.append(result[0])
info7 = []
# Build the full URL of every chapter page (the base directory must match the index URL)
for element in info6:
    info7.append('https://www.kanunu8.com/book2/10986/' + element)
# Crawl each chapter page in turn
for i in range(len(info7)):
    chapter = requests.get(info7[i])
    chapter_txt = chapter.content.decode(encoding='gbk')
    content_node = html.fromstring(chapter_txt)
    # Chapter title
    title = content_node.xpath('//div[@id="Article"]/h1/text()')
    # Author
    author = content_node.xpath('//div[@id="Article"]/h1/span/strong/text()')
    # Chapter URL
    s = info7[i]
    # Body text
    txt_list = content_node.xpath('//div[@class="text"]/p/text()')
    # Insert one document per chapter into the database
    student1 = {
        '章节': title,
        '作者': author,
        '地址': s,
        '文章': txt_list}
    result = collection.insert_one(student1)
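# Optional sketch: read a few documents back to confirm the inserts
# (same `collection` as above; field name as stored).
for doc in collection.find().limit(3):
    print(doc['章节'])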
# Question 4
# First, fetch the page source (template; imports and URL to fill in)
import requests
import pymongo
from lxml import html
url = '<URL>'
response = requests.get(url).content.decode()
# Parse the page
source_code = html.fromstring(response)
# Extract the outer node
html_node = source_code.xpath('//*/article[@class="article"]')
# Extract the header row
row_key = html_node[0].xpath('table/tbody/tr/th/text()')
# Extract the data cells
row_value = html_node[0].xpath('table/tbody/tr/td/text()')
# Store in the database
# Connect to the server
client = pymongo.MongoClient('mongodb://localhost:27017')
# Pick a database and a collection
db = client['<database name>']
collection = db['<collection name>']
# Insert the document
str_1 = {
    "字段行": row_key,
    "行数据": row_value
}
result = collection.insert_one(str_1)
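# Sketch (assumes the table has one <td> per header cell per row): regroup
# the flat td list into rows of len(row_key) columns before storing.
n = len(row_key)
rows = [row_value[i:i + n] for i in range(0, len(row_value), n)]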
# Top-100 list (MongoDB version)
import requests
import pymongo
from lxml import html
# First, fetch the page source
url = 'https://www.sohu.com/a/362733977_120067802'
response = requests.get(url).content.decode()
# Parse the page
source_code = html.fromstring(response)
# Extract the outer node
html_node = source_code.xpath('//*/article[@class="article"]')
print(html_node)
# Extract the header row
row_key = html_node[0].xpath('table/tbody/tr/th/text()')
print(row_key)
# Extract the data cells
row_value = html_node[0].xpath('table/tbody/tr/td/text()')
print(row_value)
client = pymongo.MongoClient('mongodb://localhost:27017')  # or mongodb://127.0.0.1:27017
print(client)
db = client['fiction']
collection = db['fiction_lab']
str_1 = {
    "字段行": row_key,
    "行数据": row_value
}
result = collection.insert_one(str_1)
print(result)
# Top-100 list (CSV version)
import csv
import requests
import re
import os.path
from lxml import html
url = 'https://www.sohu.com/a/362733977_120067802'
response = requests.get(url).content.decode()
# Pull the article title out of the page and use it as the CSV file name
model1 = '-title">.*?<h1>(.*?)<span.*?'
title = re.findall(model1, response, re.S)[0].replace('\n', '').strip() + '.csv'
csv_name = os.path.join(r'E:\爬虫', title)
source_code = html.fromstring(response)
html_node = source_code.xpath('//*/article[@class="article"]')
row_key = html_node[0].xpath('table/tbody/tr/th/text()')
row_value = html_node[0].xpath('table/tbody/tr/td/text()')
row_list = []
row_list.append(row_key)
# Regroup the flat td list into rows of four columns each
temp_list = []
for value in row_value:
    temp_list.append(value)
    if len(temp_list) == 4:
        row_list.append(temp_list)
        temp_list = []
with open(csv_name, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(row_list)
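# Hedged variant: with no encoding argument, open() falls back to the system
# locale encoding; 'utf-8-sig' adds a BOM so Excel detects UTF-8 for the
# Chinese header cells.
with open(csv_name, 'w', newline='', encoding='utf-8-sig') as f:
    csv.writer(f).writerows(row_list)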