# --- 爬虫 bs4 (scraping with BeautifulSoup) ---
# Fetch the Baidu homepage and print its <title> element using BeautifulSoup.
import requests
from bs4 import BeautifulSoup

target_url = 'https://www.baidu.com/'
resp = requests.get(target_url)
resp.encoding = 'utf-8'  # force UTF-8 decoding of the response body
print(resp.encoding)
page_text = resp.text
print(type(page_text))
soup = BeautifulSoup(page_text, 'html.parser')
print(type(soup))
print(soup.select('title'))
# --- 爬虫 lxml (scraping with lxml / XPath) ---
# Search Baidu Tieba for the "篮球" (basketball) forum and print every
# anchor href and anchor text on the result page via lxml XPath.
import requests
from lxml import etree

forum_name = '篮球'
query_params = {'kw': forum_name}
base_url = 'https://tieba.baidu.com/f?'
# e.g. https://tieba.baidu.com/f?kw=%E4%B8%96%E7%95%8C%E6%9D%AF
ua_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
resp = requests.get(base_url, params=query_params, headers=ua_headers)
resp.encoding = 'utf-8'
print(type(resp.text))
page_source = resp.text
dom = etree.HTML(page_source)
print(type(dom))
print(dom.xpath("//a/@href"))
print(dom.xpath("//a/text()"))
# --- 网站 pm2.5 数据 / 58同城 (PM2.5 readings and 58.com rental listings) ---
# Scrape two pages:
#   1) the PM2.5 reading for Wenzhou (Ouhai) from pm25x.com (BeautifulSoup),
#   2) rental listings (title + monthly price) from 58.com Hangzhou (lxml).
import requests
from lxml import etree
from bs4 import BeautifulSoup

# pm2.5
url = 'http://www.pm25x.com/city/wenzhou.htm'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
sp = BeautifulSoup(response.text, 'html.parser')
print("温州瓯海pm2.5:")
# NOTE: renamed from `list`, which shadowed the builtin.
cells = sp.select('table > tr > td:nth-of-type(4)')
readings = [cell.get_text() for cell in cells]
# Guard against layout changes: `readings[0]` on an empty match used to
# raise IndexError and abort the whole script.
if readings:
    print(readings[0])

# 租房子 (rentals)
url2 = 'http://hz.58.com/chuzu/?utm_source=sem-sales-baidu-pc&spm=57648845030.14911346991&utm_campaign=sell&utm_medium=cpc&showpjs=pc_fg'
headers2 = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
response2 = requests.get(url2, headers=headers2)
response2.encoding = 'utf-8'
doc = etree.HTML(response2.text)
# XPath keyed to a specific sortid timestamp — fragile; assumes the page
# still contains that listing. TODO confirm against the live page layout.
title = doc.xpath('//li[@sortid="1531411205000"]/div[2]/h2/a/text()')
price = doc.xpath('//li[@sortid="1531411205000"]/div[3]/div/b/text()')
print("58同城租房间:")
# zip stops at the shorter list, so a missing price can no longer raise
# IndexError as the index-based loop did.
for listing_title, listing_price in zip(title, price):
    print(listing_title + listing_price + "元/月")
# --- 豆瓣影评写入文件 (write Douban movie reviews to a file) ---
# Fetch the Douban Movies "best reviews" index page, follow each review
# link, and append one "title,review-text" line per review to the file
# 'yinping' (UTF-8).
from lxml import etree
import requests
import time

with open('yinping', 'w', encoding='utf-8') as f:
    url = 'https://movie.douban.com/review/best/'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
    data = requests.get(url, headers=headers).text
    doc = etree.HTML(data)
    titles = doc.xpath('//*[@id="content"]/div/div[1]/div/div/div/div/h2/a/text()')
    links = doc.xpath('//*[@id="content"]/div/div[1]/div/div/div/div/h2/a/@href')
    # zip pairs each title with its link; the old index loop raised
    # IndexError whenever the two lists differed in length.
    for title, link in zip(titles, links):
        detail = requests.get(link, headers=headers)
        detail.encoding = 'utf-8'
        detail_doc = etree.HTML(detail.text)
        paragraphs = detail_doc.xpath('//p/text()')
        # Join the paragraph strings: the original wrote the Python list
        # repr (e.g. "['text', ...]") into the file instead of the text.
        f.write("{},{}\n".format(title, " ".join(paragraphs)))
        # Polite crawl delay — `time` was imported but never used, which
        # strongly suggests this was the intent.
        time.sleep(1)