Web-scraping practice notes (for personal use)

# Exercise 1: extracting text from an HTML string with regular expressions
import re

strs = """<html>
<head>
    <meta charset="UTF-8">
    <title>学号尾号为9</title>
</head>
<body>
    <h1>呼和浩特:蔬菜价格同比去年上涨37.71%</h1>
    <p>近日,全国蔬菜价格普遍上涨,呼和浩特市民的“菜篮子”怎么样?</p>
    <p>11月23~24日,记者走访了美通、东瓦窑、百家润、笙鲜农场、北京华联等各大批发市场、超市看到,受季节、天气等多重因素影响,一些生活必需品略有上涨,但供应充足,品种丰富。</p>
    <p>蔬菜价格同比去年上涨37.71%</p>
    <p>在东瓦窑农副产品批发市场,各类蔬菜琳琅满目,黄瓜5.0元/斤,西红柿4.5元/斤,本地豆角7.5元/斤,大白菜1.5元/斤,白萝卜2.0元/斤,土豆1.5元/斤。</p>
    <p>商贩们说,今年菜价相比去年略高一些,其中叶类菜、豆角、辣椒、黄瓜等价格涨幅较高。</p>
    <p>在百家润等生鲜超市,记者了解到,部分叶类菜的价格高于往年,其中,菠菜、油麦菜6.98元/斤,油菜4.20元/斤,芹菜4元/斤。</p>
</body>
</html>"""

# re.S (DOTALL) lets '.' in the patterns match newlines as well
model1 = '<h1>(.*?)</h1>'
model2 = '<p>(.*?)</p>'
result1 = re.findall(model1, strs, re.S)
result2 = re.findall(model2, strs, re.S)
print(result1)
print(result2)
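# The captured fragments keep whatever leading/trailing whitespace the source
# had. A minimal clean-up sketch, reusing result1/result2 from above (no other
# assumptions):
title = result1[0].strip() if result1 else ''
paragraphs = [p.strip() for p in result2]
print(title)
for p in paragraphs:
    print(p)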

# Exercise 2: scraping a Baidu Tieba thread with requests + regex, saving to CSV
import requests
import re
import csv
# Fetch the page source. Tieba may serve a stripped-down page to clients without
# a browser User-Agent; if the lists below come back empty, pass a
# headers={'User-Agent': '...'} argument to requests.get().
url = r'https://tieba.baidu.com/p/7544967041'
html = requests.get(url).content.decode()
print(html)

# Extract the usernames
username_list = re.findall('username="(.*?)".*?class=', html, re.S)
print(username_list)

# Extract the post contents
content_list = re.findall('class="d_post_content.*?j_d_post_content.*?style="display:;">(.*?)</div>',html,re.S)
print(content_list)

# Extract the post times (the dates are HTML-entity-escaped in the source, hence the &quot; in the pattern)
reply_time_list = re.findall('date&quot;:&quot;(20.*?)&quot',html,re.S)
print(reply_time_list)

# Assemble one row per post; this indexes all three lists by position, so it
# assumes the three regexes matched the same number of items
# (a zip-based variant that tolerates mismatches follows the CSV write below)
data_list = []
for i in range(len(username_list)):
    result = {'username': username_list[i],
              'content': content_list[i],
              'reply_time': reply_time_list[i]
              }
    data_list.append(result)

with open(r'D:\Temp\tieba123456.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['username', 'content', 'reply_time'])
    writer.writeheader()
    writer.writerows(data_list)
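# A more defensive sketch for the row assembly above: zip() pairs the lists item
# by item and simply stops at the shortest one, so a missed match in one regex
# cannot raise an IndexError. Same fields, same CSV layout.
data_list = [
    {'username': u, 'content': c, 'reply_time': t}
    for u, c, t in zip(username_list, content_list, reply_time_list)
]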

# Exercise 3: extracting chapter links with lxml XPath, step by step

import lxml.html
import requests
url = r'https://www.kanunu8.com/book2/11135/index.html'
response = requests.get(url)
html_txt = response.content.decode(encoding="gbk")
selector = lxml.html.fromstring(html_txt)

info1 = selector.xpath('//div[@class="main"]')
print(info1)
# Note: an XPath starting with '//' is evaluated against the whole document even
# when called on an element; './/' (or a plain relative path) keeps the search
# inside the node found in the previous step
info2 = info1[0].xpath('.//div[@class="col-left"]')
print(info2)
info3 = info2[0].xpath('.//div[@class="book"]')
print(info3)
info4 = info3[0].xpath('.//dl')
print(info4)
info5 = info4[0].xpath('.//dd')
print(info5)

info6 = []
for element in info5:
    # result = element.xpath('./a/@href')
    # or equivalently:
    result = element.xpath('a/@href')
    if result:
        info6.append(result[0])
# info6 now holds the relative href of every chapter
# Print the list
for u in info6:
    print(u)

# Join each relative href against the index URL. The original hard-coded
# 'book2/10986/', which is a different book's directory; urljoin keeps the base
# consistent with the url requested above.
from urllib.parse import urljoin
info7 = []
for element in info6:
    info7.append(urljoin(url, element))
print(info7)
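# Quick sanity-check sketch: fetch the first chapter page and print its title.
# This assumes info7 is non-empty, that chapter pages are GBK-encoded like the
# index, and that they use the same div[@id="Article"]/h1 layout as in the
# MongoDB exercise further down.
first_chapter = requests.get(info7[0]).content.decode(encoding='gbk')
first_node = lxml.html.fromstring(first_chapter)
print(first_node.xpath('//div[@id="Article"]/h1/text()'))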

# Exercise 3, second version
import requests
import lxml.html
import re
import os
url = r'https://www.kanunu8.com/book2/11137/index.html'
response = requests.get(url)
html_code = response.content.decode(encoding="gbk")
selector = lxml.html.fromstring(html_code)
# A single XPath expression that collects every chapter href in one step
info = selector.xpath('//div[@class="main"]/div[@class="col-left"]/div[@class="book"]/dl/dd/a/@href')
# The same result, step by step (print() replaces the bare expressions, which
# only echo a value in an interactive session and do nothing in a script)
info1 = selector.xpath('//div[@class="main"]')
print(info1)
info2 = info1[0].xpath('./div[@class="col-left"]')
print(info2)
info3 = info2[0].xpath('./div[@class="book"]')
print(info3)
info4 = info3[0].xpath('./dl')
print(info4)
info5 = info4[0].xpath('./dd')
print(info5)
info6 = []
for temp in info5:
    result = temp.xpath('./a/@href')
    if result:
        info6.append(result[0])
print(info6)
# Build the full chapter URLs from the hrefs returned by the single XPath above
# (the original appended to `result`, a leftover from the loop, instead of info7)
info7 = []
for element in info:
    info7.append('https://www.kanunu8.com/book2/11137/' + element)
for u in info7:
    print(u)
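# Sanity-check sketch: the one-line XPath and the step-by-step chain should
# produce the same list of hrefs, so either approach can feed the URL join above.
assert info == info6, 'single XPath and stepwise extraction disagree'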

# Exercise 3: storing the chapters in MongoDB
import requests
import pymongo
from lxml import html

# Start the MongoDB server first and check that it is reachable
# (on Windows: Win+R -> cmd, then run `mongo` to open the shell)

# Fetch the page source (the URL is left as a placeholder)
url = r'网址'
response = requests.get(url).content.decode()
# Parse the page
source_code = html.fromstring(response)
# Extract nodes from the outermost to the innermost
info1 = source_code.xpath('//div[@class="main"]')
info2 = info1[0].xpath('./div[@class="col-left"]')
info3 = info2[0].xpath('./div[@class="book"]')
info4 = info3[0].xpath('./dl')
info5 = info4[0].xpath('./dd')
info6 = []
for temp in info5:
    result = temp.xpath('./a/@href')
    if result:
        info6.append(result[0])
info7 = []
# Full URL of every chapter page
for element in info6:
    info7.append('https://www.kanunu8.com/book2/10986/' + element)
# Connect to MongoDB before the loop; the database and collection names here are
# only examples (the same ones used in the "Top 100" exercise further down)
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['fiction']
collection = db['fiction_lab']

# Loop over the chapter pages and scrape each one
# (the original hard-coded range(18); len(info7) follows the link list instead)
for i in range(len(info7)):
    chapter = requests.get(info7[i])
    chapter_txt = chapter.content.decode(encoding='gbk')
    content_node = html.fromstring(chapter_txt)
    # Chapter title
    title = content_node.xpath('//div[@id="Article"]/h1/text()')
    # Author
    author = content_node.xpath('//div[@id="Article"]/h1/span/strong/text()')
    # Chapter URL
    s = info7[i]
    # Body paragraphs
    txt_list = content_node.xpath('//div[@class="text"]/p/text()')
    # One document per chapter; insert_one() replaces the Collection.insert()
    # that was removed in newer pymongo versions
    student1 = {
        '章节': title,
        '作者': author,
        '地址': s,
        '文章': txt_list}
    result = collection.insert_one(student1)
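# Verification sketch (assuming the connection above): read the stored chapters
# back and print the title field of each document.
for doc in collection.find({}, {'章节': 1, '_id': 0}):
    print(doc)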

# Exercise 4: scraping a table into MongoDB (outline; full version below)
# Fetch the page source (the URL is left as a placeholder)
url = '网址'
response = requests.get(url).content.decode()
# Parse the page
source_code = html.fromstring(response)
# Outer node that wraps the table
html_node = source_code.xpath('//*/article[@class="article"]')
# Header row (th cells)
row_key = html_node[0].xpath('table/tbody/tr/th/text()')
# Data cells (td)
row_value = html_node[0].xpath('table/tbody/tr/td/text()')


# Store in MongoDB
# Connect to the server
client = pymongo.MongoClient('mongodb://localhost:27017')
# Pick a database and a collection (the names are placeholders)
db = client['库名']
collection = db['集名']
# Insert a single document
str_1 = {
    "字段行": row_key,
    "行数据": row_value
    }
result = collection.insert_one(str_1)

# Top-100 list: full version, table into MongoDB
import requests
import pymongo
from lxml import html

# Fetch the page source
url = 'https://www.sohu.com/a/362733977_120067802'
response = requests.get(url).content.decode()

# Parse the page
source_code = html.fromstring(response)

# Outer node that wraps the table
html_node = source_code.xpath('//*/article[@class="article"]')
print(html_node)

# Header row (th cells)
row_key = html_node[0].xpath('table/tbody/tr/th/text()')
print(row_key)

# Data cells (td)
row_value = html_node[0].xpath('table/tbody/tr/td/text()')
print(row_value)

# Connect to MongoDB (mongodb://127.0.0.1:27017 works as well)
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['fiction']
collection = db['fiction_lab']

str_1 = {
    "字段行": row_key,
    "行数据": row_value
    }
result = collection.insert_one(str_1)
print(result.inserted_id)

# Top-100 list: saving the same table to CSV
import csv
import requests
import re
import os.path
from lxml import html

url = 'https://www.sohu.com/a/362733977_120067802' 
response = requests.get(url).content.decode()

# Pull the article title out of the page source and use it as the CSV file name
model1 = '-title">.*?<h1>(.*?)<span.*?'
title = re.findall(model1, response, re.S)[0].replace('\n', '').strip() + '.csv'

csv_name = os.path.join(r'E:\爬虫', title)

source_code = html.fromstring(response)

html_node = source_code.xpath('//*/article[@class="article"]')

row_key = html_node[0].xpath('table/tbody/tr/th/text()')

row_value = html_node[0].xpath('table/tbody/tr/td/text()')

# The table has four columns, so regroup the flat list of td values into rows of 4
row_list = []
row_list.append(row_key)
temp_list = []
for value in row_value:
    temp_list.append(value)
    if len(temp_list) == 4:
        row_list.append(temp_list)
        temp_list = []

# utf-8-sig writes a BOM so Excel recognises the Chinese text in the file
with open(csv_name, 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerows(row_list)
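# Alternative sketch: pair each data row with the header cells and write the
# same table with csv.DictWriter. Assumes the header row (row_key) also has
# exactly four cells, matching the 4-column regrouping above.
rows_as_dicts = [dict(zip(row_key, row)) for row in row_list[1:]]
with open(csv_name, 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=row_key)
    writer.writeheader()
    writer.writerows(rows_as_dicts)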
