Python --- Small Crawler Examples

Crawling with bs4

import requests
from bs4 import BeautifulSoup

url = 'https://www.baidu.com/'

# Fetch the page and force UTF-8 so the Chinese text decodes correctly
response = requests.get(url)
response.encoding = 'utf-8'
print(response.encoding)

html = response.text
print(type(html))        # <class 'str'>

# Parse the markup with the standard-library html.parser backend
sp = BeautifulSoup(html, 'html.parser')
print(type(sp))          # <class 'bs4.BeautifulSoup'>

# CSS-select the <title> element
print(sp.select('title'))
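
The select call returns a list of Tag objects, so the text itself still has to be pulled out. A minimal sketch of that last step, reusing the sp object from above:

# Extract the text of the first matched <title>, guarding against no match
titles = sp.select('title')
if titles:
    print(titles[0].get_text())   # e.g. 百度一下,你就知道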

Crawling with lxml

import requests
from lxml import etree

# Search keyword for Baidu Tieba; requests percent-encodes it automatically
name = '篮球'
kw = {'kw': name}

url = 'https://tieba.baidu.com/f?'
# Encoded form, e.g.: https://tieba.baidu.com/f?kw=%E4%B8%96%E7%95%8C%E6%9D%AF

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}

response = requests.get(url, params=kw, headers=headers)
response.encoding = 'utf-8'
print(type(response.text))    # <class 'str'>

# Build an lxml element tree from the raw HTML string
html_doc = etree.HTML(response.text)
print(type(html_doc))         # <class 'lxml.etree._Element'>

# All link targets, then all link texts, on the page
print(html_doc.xpath("//a/@href"))
print(html_doc.xpath("//a/text()"))

# Debug helper: dump the raw HTML line by line
# for row in response.text.splitlines():
#     print(row)
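
The two XPath queries above return separate, parallel lists, so a link's text can drift out of step with its href when some anchors have no text. A minimal sketch that iterates the <a> elements directly to keep each pair together, reusing html_doc from above:

# Iterate <a> elements so each href stays paired with its own text
for a in html_doc.xpath('//a'):
    href = a.get('href')          # None when the anchor has no href
    text = a.text or ''           # None when the anchor has no direct text
    print(text.strip(), '->', href)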

PM2.5 data from a website, plus 58.com rental listings

import requests
from lxml import etree
from bs4 import BeautifulSoup

# Part 1: PM2.5 reading for Wenzhou
url = 'http://www.pm25x.com/city/wenzhou.htm'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
sp = BeautifulSoup(response.text, 'html.parser')

# An earlier lxml attempt, //table/tbody/tr[2]/td[4]/text(), found nothing:
# the <tbody> only exists in the browser's fixed-up DOM, not in the raw HTML.
print("温州瓯海pm2.5:")
cells = sp.select('table > tr > td:nth-of-type(4)')
values = [cell.get_text() for cell in cells]
print(values[0])

# Part 2: rental listings on 58.com
url2 = 'http://hz.58.com/chuzu/?utm_source=sem-sales-baidu-pc&spm=57648845030.14911346991&utm_campaign=sell&utm_medium=cpc&showpjs=pc_fg'
response2 = requests.get(url2, headers=headers)
response2.encoding = 'utf-8'
doc = etree.HTML(response2.text)

# The hard-coded sortid pins the listings captured on one crawl date
title = doc.xpath('//li[@sortid="1531411205000"]/div[2]/h2/a/text()')
price = doc.xpath('//li[@sortid="1531411205000"]/div[3]/div/b/text()')

print("58同城租房间:")
for t, p in zip(title, price):    # zip avoids an IndexError if the lists differ
    print(t + p + "元/月")
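
Hard-coding one sortid value only matches listings from a single crawl, so the loop usually prints nothing on a fresh fetch. A hedged sketch that matches any listing <li> carrying a sortid attribute instead; the div positions are an assumption carried over from the original XPath and should be re-checked against the live page:

# Hypothetical generalization: accept every <li> with a sortid attribute
for li in doc.xpath('//li[@sortid]'):
    t = li.xpath('./div[2]/h2/a/text()')     # assumed title position
    p = li.xpath('./div[3]/div/b/text()')    # assumed price position
    if t and p:                              # skip listings that do not match
        print(t[0].strip() + p[0] + "元/月")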

Writing Douban movie reviews to a file

from lxml import etree
import requests
import time

with open('yinping', 'w', encoding='utf-8') as f:
    url = 'https://movie.douban.com/review/best/'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}

    data = requests.get(url, headers=headers).text
    doc = etree.HTML(data)

    # Review titles and the links to their full-text pages
    title = doc.xpath('//*[@id="content"]/div/div[1]/div/div/div/div/h2/a/text()')
    href2 = doc.xpath('//*[@id="content"]/div/div[1]/div/div/div/div/h2/a/@href')

    for t, link in zip(title, href2):
        response2 = requests.get(link, headers=headers)
        response2.encoding = 'utf-8'
        doc2 = etree.HTML(response2.text)

        # Grab every paragraph on the review page (coarse, but simple)
        text = doc2.xpath('//p/text()')
        f.write("{},{}\n".format(t, text))
        time.sleep(1)    # be polite: pause between review-page requests
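
Writing "{},{}\n" by hand produces a malformed CSV as soon as a title contains a comma. A minimal sketch of the same output using the standard-library csv module; 'yinping.csv' is a hypothetical file name, and the commented line shows what replaces the f.write call inside the loop above:

import csv

# newline='' lets csv.writer control the line endings itself
with open('yinping.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'review_text'])    # header row
    # Inside the crawl loop, replace the f.write(...) call with:
    # writer.writerow([t, ' '.join(s.strip() for s in text)])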