- 使用requests库
- 使用re模块清洗响应数据
从便民网站爬取常用电话,并将其写入文件,以便查询
代码如下:
"""
topic:爬取“便民查询网上的常用号码”
author:小灵子
date:2019-5-31
思路:先从开发者工具找到相应请求头、url等等
<tr bgcolor="#EFF7F0">
<td>匪警</td>
<td>110</td>
</tr>
"""
import re
import requests
import json
# Page listing commonly used phone numbers, and the local JSON cache file.
URL = "http://changyongdianhuahaoma.51240.com/"
DATA_PATH = "D:/testDrectory/data"
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

# Browser-like User-Agent sent with the request. Adjacent string literals are
# concatenated by the parser, so no fragile backslash continuations are needed.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/54.0.2840.99 Safari/537.36"
    ),
}

# One table row: <tr bgcolor="..."> <td>NAME</td> <td>NUMBER</td> </tr>
pattern = r'<tr bgcolor=".*?">\s{1,}<td>(.+?)</td>\s{1,}<td>(.+?)</td>\s+</tr>'


def fetch_page(url: str = URL) -> str:
    """Download *url* and return the decoded HTML text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text responses, which would garble the Chinese names used as keys.
    # In that case let requests detect the encoding from the body instead.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = response.apparent_encoding
    return response.text


def extract_entries(text: str) -> list:
    """Return every ``(name, number)`` pair found in the HTML *text*.

    Pairs are returned in document order; an input with no matching table
    rows yields an empty list.
    """
    return re.findall(pattern, text)


def save_numbers(numbers: dict, path: str = DATA_PATH) -> None:
    """Write the name -> number mapping to *path* as JSON.

    Raises:
        OSError: if the file cannot be opened or written.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(numbers, f)


def load_numbers(path: str = DATA_PATH) -> dict:
    """Read the name -> number mapping back from the JSON file at *path*.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def main() -> None:
    """Scrape the numbers, cache them on disk, then answer one lookup."""
    entries = extract_entries(fetch_page())
    for entry in entries:
        print(entry)
    # Later rows overwrite earlier ones that share the same name.
    numbers = dict(entries)

    try:
        save_numbers(numbers)
    except OSError as e:
        # Saving is best-effort: report the problem and keep using the data
        # just scraped rather than re-reading a missing or stale file.
        print(e)
    else:
        numbers = load_numbers()

    name = input("输入公司名称:")
    number = numbers.get(name)
    if number is None:
        # An unknown name used to crash with KeyError.
        print("未找到该名称对应的电话号码:", name)
    else:
        print("电话号码:", number)


if __name__ == "__main__":
    main()