需求:需要寻找一批游戏或者教育网站地址
思路:1. 先用关键字在百度搜索,得到一批结果;2. 然后逐个访问搜索到的结果,根据网页内容中是否包含关键字来判断是否是目标站
# -*- coding: utf-8 -*-
# @Time : 2018/8/7 13:02
# @Author : xiangchaoming
# @QQ : 239036082
# 引入浏览器驱动
from selenium import webdriver
# lxml's html module is used to parse page source and run XPath queries
from lxml import html
def keys():
    """Load the crawler configuration from ``keys.txt``.

    The file is opened relative to the current working directory, read as
    UTF-8, and its text is evaluated as a Python expression.

    Returns:
        The object the file's text evaluates to. Code in this file indexes
        the result with ``"key1"``, ``"key2"`` and ``"pages"``.

    Raises:
        OSError: if ``keys.txt`` cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open("keys.txt", "r", encoding="utf8") as f:
        fr = f.read()
    # SECURITY NOTE(review): eval() executes whatever code keys.txt contains.
    # It is kept so existing config files keep working, but if the file only
    # ever holds a literal dict, ast.literal_eval or json.loads would be a
    # safer replacement -- confirm before switching.
    return eval(fr)
def _site_root(url):
    """Return the ``scheme://host`` prefix of *url*, or None if it has none."""
    parts = url.split("/")
    # An absolute URL splits as ["scheme:", "", "host", ...]. Anything else
    # (e.g. a relative link such as "http.html") has no host to extract.
    if len(parts) < 3 or parts[1] != "" or not parts[2]:
        return None
    return parts[0] + "//" + parts[2]


def _record_if_match(browser, site, keywords, out):
    """Open *site* and append it to *out* if its page source holds a keyword."""
    try:
        browser.get(site)
        page = browser.page_source
        if any(key in page for key in keywords):
            print("关键域名:" + site)
            out.write(site)
            out.write("\n")
            # Flush per hit so results survive an interrupted run.
            out.flush()
    except Exception as e:
        # Deliberate best-effort: one unreachable site must not stop the crawl.
        print("*" * 50)
        print(e)


def _crawl_result(browser, link, keywords, seen, out):
    """Check one search-result link and every absolute link found on its page."""
    try:
        browser.get(link)
        landing = html.etree.HTML(browser.page_source)

        # current_url is read after navigation, so it reflects any redirect
        # away from the search-result link.
        root = _site_root(browser.current_url)
        if root is not None and root not in seen:
            seen.add(root)
            _record_if_match(browser, root, keywords, out)

        # The tree was parsed before navigating away, so it is still usable.
        for href in landing.xpath(r"//a/@href"):
            if not href.startswith("http"):
                continue
            root = _site_root(href)
            if root is None or root in seen:
                continue
            seen.add(root)
            _record_if_match(browser, root, keywords, out)
    except Exception as e:
        # Deliberate best-effort: skip this result and move to the next one.
        print("#" * 50)
        print(e)


def MainFunction():
    """Crawl Baidu results and record sites whose pages contain a keyword.

    Reads the configuration once via ``keys()``: ``"key1"`` is the search
    term, ``"key2"`` the keywords looked for in page source, and ``"pages"``
    the number of Baidu result pages to walk (offset ``10 * p`` per page).
    For each result, the landing site's root and the root of every absolute
    link on the landing page are opened once each; a root whose page source
    contains any keyword is printed and appended to ``urls.txt``.

    Errors while loading a page are printed and skipped. The output file is
    closed and the Firefox session is quit when the crawl ends, including
    when it ends by an exception.
    """
    # Read the config once, before any resource is acquired; the original
    # re-read and re-evaluated keys.txt on every page and every site check.
    config = keys()
    page_count = config["pages"]
    query = config["key1"]
    keywords = config["key2"]

    # A set gives O(1) "already visited" checks.
    seen = set()

    # Explicit UTF-8 so a non-ASCII host name cannot fail on the locale codec.
    with open("urls.txt", "a", encoding="utf8") as fw:
        browser = webdriver.Firefox()
        try:
            browser.set_page_load_timeout(10)
            # Original author's note: both timeouts must be set to be effective.
            browser.set_script_timeout(10)
            browser.delete_all_cookies()

            for p in range(page_count):
                try:
                    browser.get(
                        "https://www.baidu.com/s?wd=" + query + "&pn=" + str(10 * p)
                    )
                    # Links of the organic results on the Baidu page.
                    result_links = html.etree.HTML(browser.page_source).xpath(
                        "//h3[@class='t']/a/@href"
                    )
                    for link in result_links:
                        _crawl_result(browser, link, keywords, seen, fw)
                except Exception as e:
                    # Deliberate best-effort: a failed result page is skipped.
                    print("$" * 50)
                    print(e)
        finally:
            # The original never shut the browser down, leaking the process.
            browser.quit()
# Script entry point. The call is not guarded by ``if __name__ == "__main__"``,
# so the crawl also starts if this file is imported.
MainFunction()
keys.txt:key1 为百度搜索关键字,key2 为网页内容关键字(列表),pages 表示爬取百度结果的前多少页
{
"key1": "炸金花",
"key2": ["捕鱼游戏", "新葡京", "棋牌"],
"pages": 100
}