目录:
- 从内涵段子网站爬取脑筋急转弯并进行文本解析保存为 .txt 格式
- 从有道翻译网站上使用 POST 请求方式获取数据, 输入 汉/英 显示翻译的 英/汉
- 猫眼电影 top100 csv文件 排名 / 电影名称 / 主演 / 上映时间
- 链家二手房 房源数据
# -*- coding: utf-8 -*-
'''
Scrape brain-teaser riddles from neihan8s.com, parse question/answer pairs
with a regex and append them to 脑筋急转弯.txt.
url: https://www.neihan8s.com/njjzw//
'''
import re
import urllib.request   # fix: 'import urllib' alone does not load the request submodule
import time

headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
baseurl = "https://www.neihan8s.com/njjzw//"

# Each match yields (question title, answer text); compile once, not per page.
pattern = '<div class="text-column-item box box-790">.*? title="(.*?)".*?<div class="desc">(.*?)</div>.*?<div class="bottom">'
p = re.compile(pattern, re.S)

for page in range(1, 20):           # fix: inner loop no longer shadows this index
    # Page 1 is index.html; later pages are index_<n>.html.
    if page == 1:
        url = baseurl + "index" + ".html"
    else:
        url = baseurl + "index_%d" % page + ".html"   # fix: removed duplicated 'url = url ='
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    time.sleep(0.5)                 # be polite between page fetches
    html = res.read().decode()
    # Append every pair from this page; open the file once per page and with an
    # explicit encoding so output does not depend on the platform default.
    with open("脑筋急转弯.txt", 'a', encoding='utf-8') as f:
        for title, answer in p.findall(html):
            # strip leading U+3000 ideographic spaces from the answer
            f.write(title + " >>> " + answer.lstrip('\u3000'))
            f.write('\n\n')
# -*- coding: utf-8 -*-
"""
POST to the Youdao translate web API with urllib and print the translation.
Input Chinese/English text; the English/Chinese translation is shown.
"""
import urllib.request
import urllib.parse
import json

# Request invariants hoisted out of the input loop.
# NOTE: the browser URL contains '_o' after 'fanyi.youdao.com/translate';
# it must be removed for this unsigned request to be accepted.
URL = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Constant form fields captured from the site; only 'i' (the query) varies.
FORM = {'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '15706089102276',
        'sign': 'fe13485d0dd7803b64d5a239f8d65dce',
        'ts': '1570608910227',
        'bv': '4aa7828b641c5e2587e46a4b35eb3523',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_REALTlME'}   # 'REALTlME' (lowercase l) is the literal value the site sends

while True:
    print("===============================")
    key = input("请输入要翻译的内容>>>")
    # Build the form: the fixed fields plus the user's query under key 'i',
    # url-encoded to the bytes body urllib requires for a POST.
    form = dict(FORM, i=key)
    body = urllib.parse.urlencode(form).encode('utf-8')
    req = urllib.request.Request(URL, data=body, headers=HEADERS)
    with urllib.request.urlopen(req) as res:    # fix: close the connection promptly
        reply = json.loads(res.read().decode('utf-8'))
    try:
        print('>>>>>>', reply['translateResult'][0][0]['tgt'])
    except (KeyError, IndexError):
        # Reply carried no translateResult (e.g. an error payload) —
        # show it instead of crashing the input loop.
        print('>>>>>>', reply)
"""
从有道翻译网站上使用 requests.post()方法 请求方式获取数据
输入 汉/英 显示翻译的 英/汉
"""
import requests
import json
while True:
#表单编码
print("===============================")
key = input("请输入要翻译的内容>>>")
#post方法要求data为字典格式
data = {'i': key,
'from': 'AUTO',
'to': 'AUTO',
'smartresult': 'dict',
'client': 'fanyideskweb',
'salt': '15706089102276',
'sign': 'fe13485d0dd7803b64d5a239f8d65dce',
'ts': '1570608910227',
'bv': '4aa7828b641c5e2587e46a4b35eb3523',
'doctype': 'json',
'version': '2.1',
'keyfrom': 'fanyi.web',
'action': 'FY_BY_REALTlME'}
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {"User-Agent":"Mozilla/5.0"}
res = requests.post(url,data=data,headers=headers)
res.encoding = "utf-8"
html = res.text
html = json.loads(html)
print('>>>>>>',html['translateResult'][0][0]['tgt'])
# -*- coding:utf-8 -*-
'''
猫眼电影 top100 csv文件
排名 / 电影名称 / 主演 / 上映时间
'''
import urllib
import re
import time
import csv
class maoYanSpider(object):
    """Crawl the Maoyan top-100 board and append one CSV row per film
    (rank / title / stars / release time) to 猫眼电影TOP100.csv."""

    def __init__(self):
        # Pages are addressed as ?offset=0,10,20,...,90 — ten films per page.
        self.baseurl = 'https://maoyan.com/board/4?offset='
        self.headers = {"User-Agent": "Mozilla/5.0"}

    def getHtml(self, url):
        """Download one board page and hand the decoded HTML to the parser."""
        # fix: the file only does 'import urllib', which does not load the
        # request submodule; import it explicitly here.
        import urllib.request
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode()
        self.reHtml(html)

    def reHtml(self, html):
        """Extract (rank, title, stars, release-time) tuples and save them."""
        pattern = '<i class="board-index board-index-(.*?)">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>'
        p = re.compile(pattern, re.S)
        r = p.findall(html)
        time.sleep(0.1)   # small delay between page requests
        self.saveInfo(r)

    def saveInfo(self, r):
        """Append one row per film, removing the '主演:'/'上映时间:' labels.

        fix: the original used lstrip("主演:"), which strips any of the
        characters 主 / 演 / : from the left and can therefore eat the start
        of an actor's name; remove the exact prefix instead.
        """
        # explicit utf-8 so the CSV encoding is platform-independent
        with open("猫眼电影TOP100.csv", "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            for rank, title, stars, release in r:
                stars = stars.strip()
                if stars.startswith("主演:"):
                    stars = stars[len("主演:"):]
                if release.startswith("上映时间:"):
                    release = release[len("上映时间:"):]
                writer.writerow([rank, title, stars, release])

    def workOn(self):
        """Write the CSV header row once, then crawl all ten pages."""
        with open("猫眼电影TOP100.csv", "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(["排名", "电影名称", "主演", "上映时间"])
        for i in range(0, 10):
            self.getHtml(self.baseurl + str(i * 10))
if __name__ == "__main__":
maoyan = maoYanSpider()
maoyan.workOn()
# -*- coding: utf-8 -*-
"""
mail: dongmouren@live.com
使用 requests 模块爬取链家二手房上的重庆渝中区房源数据
url:
https://cq.lianjia.com/ershoufang/yuzhong/pg1/
https://cq.lianjia.com/ershoufang/yuzhong/pg2/
...
"""
import requests
import re
import csv
class lianJiaSpider(object):
    """Crawl Lianjia second-hand listings (Chongqing, Yuzhong district) and
    append title/location/layout/area/price rows to 链家二手房数据.csv."""

    def __init__(self):
        self.baseurl = "https://cq.lianjia.com/ershoufang/yuzhong/"
        self.headers = {"User-Agent": "Mozilla/5.0"}
        # CSV header row, written once by workOn().
        self.title = ["标题", "位置", "房型", "面积", "价格"]

    def getHtml(self, url):
        """Download one listings page and pass its text to the parser."""
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"   # force correct decoding of the Chinese payload
        self.reHtml(res.text)

    def reHtml(self, text):
        """Extract (price, title, location, layout, area) tuples and save them."""
        pattern = '<div class="price"><span>(.*?)</span>.*?data-el="ershoufang">(.*?)</a><div class="info">(.*?)<span>/</span>(.*?)<span>/</span>(.*?)<span>'
        p = re.compile(pattern, re.S)
        self.saveData(p.findall(text))

    def saveData(self, result):
        """Append one CSV row per listing; price (match group 0) moves to the
        last column with a 万 (10k RMB) suffix."""
        # fix: explicit utf-8 so the output does not depend on the platform's
        # default encoding (the original breaks Chinese text on GBK systems)
        with open("链家二手房数据.csv", "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            for price, title, location, layout, area in result:
                writer.writerow([title, location, layout, area, price + "万"])

    def workOn(self):
        """Write the header row once, then crawl pages pg1 .. pg4."""
        with open("链家二手房数据.csv", "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(self.title)
        for page in range(1, 5):
            self.getHtml(self.baseurl + "pg%d/" % page)
if __name__ == "__main__":
test = lianJiaSpider()
test.workOn()