# --- 爬虫 bs4 (scraping with BeautifulSoup) ---
# Fetch the Baidu homepage and print its <title> element using BeautifulSoup.
import requests
from bs4 import BeautifulSoup

target_url = 'https://www.baidu.com/'
resp = requests.get(target_url)
resp.encoding = 'utf-8'  # force UTF-8 decoding of the response body
print(resp.encoding)
page_text = resp.text
print(type(page_text))
soup = BeautifulSoup(page_text, 'html.parser')
print(type(soup))
print(soup.select('title'))
# --- 爬虫 lxml (scraping with lxml / XPath) ---
# Search Baidu Tieba for the "篮球" (basketball) forum and print every
# anchor href and anchor text on the result page via lxml XPath.
import requests
from lxml import etree

forum_name = '篮球'
query_params = {'kw': forum_name}
base_url = 'https://tieba.baidu.com/f?'
# e.g. https://tieba.baidu.com/f?kw=%E4%B8%96%E7%95%8C%E6%9D%AF
ua_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
resp = requests.get(base_url, params=query_params, headers=ua_headers)
resp.encoding = 'utf-8'
print(type(resp.text))
page_source = resp.text
dom = etree.HTML(page_source)
print(type(dom))
print(dom.xpath("//a/@href"))
print(dom.xpath("//a/text()"))
# --- 网站 pm2.5 数据 / 58同城 (PM2.5 readings and 58.com rental listings) ---
# Scrape two pages:
#   1) the PM2.5 reading for Wenzhou (Ouhai) from pm25x.com (BeautifulSoup),
#   2) rental listings (title + monthly price) from 58.com Hangzhou (lxml).
import requests
from lxml import etree
from bs4 import BeautifulSoup

# pm2.5
url = 'http://www.pm25x.com/city/wenzhou.htm'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
sp = BeautifulSoup(response.text, 'html.parser')
print("温州瓯海pm2.5:")
# NOTE: renamed from `list`, which shadowed the builtin.
cells = sp.select('table > tr > td:nth-of-type(4)')
readings = [cell.get_text() for cell in cells]
# Guard against layout changes: `readings[0]` on an empty match used to
# raise IndexError and abort the whole script.
if readings:
    print(readings[0])

# 租房子 (rentals)
url2 = 'http://hz.58.com/chuzu/?utm_source=sem-sales-baidu-pc&spm=57648845030.14911346991&utm_campaign=sell&utm_medium=cpc&showpjs=pc_fg'
headers2 = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
response2 = requests.get(url2, headers=headers2)
response2.encoding = 'utf-8'
doc = etree.HTML(response2.text)
# XPath keyed to a specific sortid timestamp — fragile; assumes the page
# still contains that listing. TODO confirm against the live page layout.
title = doc.xpath('//li[@sortid="1531411205000"]/div[2]/h2/a/text()')
price = doc.xpath('//li[@sortid="1531411205000"]/div[3]/div/b/text()')
print("58同城租房间:")
# zip stops at the shorter list, so a missing price can no longer raise
# IndexError as the index-based loop did.
for listing_title, listing_price in zip(title, price):
    print(listing_title + listing_price + "元/月")
# --- 豆瓣影评写入文件 (write Douban movie reviews to a file) ---
# Fetch the Douban Movies "best reviews" index page, follow each review
# link, and append one "title,review-text" line per review to the file
# 'yinping' (UTF-8).
from lxml import etree
import requests
import time

with open('yinping', 'w', encoding='utf-8') as f:
    url = 'https://movie.douban.com/review/best/'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
    data = requests.get(url, headers=headers).text
    doc = etree.HTML(data)
    titles = doc.xpath('//*[@id="content"]/div/div[1]/div/div/div/div/h2/a/text()')
    links = doc.xpath('//*[@id="content"]/div/div[1]/div/div/div/div/h2/a/@href')
    # zip pairs each title with its link; the old index loop raised
    # IndexError whenever the two lists differed in length.
    for title, link in zip(titles, links):
        detail = requests.get(link, headers=headers)
        detail.encoding = 'utf-8'
        detail_doc = etree.HTML(detail.text)
        paragraphs = detail_doc.xpath('//p/text()')
        # Join the paragraph strings: the original wrote the Python list
        # repr (e.g. "['text', ...]") into the file instead of the text.
        f.write("{},{}\n".format(title, " ".join(paragraphs)))
        # Polite crawl delay — `time` was imported but never used, which
        # strongly suggests this was the intent.
        time.sleep(1)