Writing a URL collector in Python 3 (Google, Baidu, Bing)

A simple implementation of URL collection for three search engines. The code is not particularly concise and has little reuse, so please bear with it.

To avoid triggering CAPTCHA checks I did not add a multithreading module; running slowly beats getting blocked (I have not managed to get past the CAPTCHA).
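If you want to stay slow without multithreading, one option is to add random jitter to the delay between requests. A minimal sketch, assuming a base delay similar to the sleep(5) used in the code below (the jitter range is my own choice, not part of the original code):

import random
from time import sleep

def polite_sleep(base=5.0, jitter=3.0):
    # Sleep for the base interval plus a random extra of up to `jitter` seconds,
    # so the request timing looks less machine-like than a fixed sleep(5).
    sleep(base + random.uniform(0, jitter))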

Baidu's result links are redirects, so a second request is needed to recover the original URL.
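Resolving a single Baidu redirect link looks roughly like the sketch below: requests follows the redirect automatically, and the final address is exposed as response.url (the link argument here is a placeholder, not a real Baidu token):

import requests

def resolve_baidu_link(link, timeout=5.0):
    # requests follows Baidu's redirect by default;
    # response.url then holds the real target URL.
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(link, headers=headers, timeout=timeout)
    return resp.url

The full collector below does the same thing inside baidu_ext().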

import requests,re
from urllib import parse
from time import sleep,time

'''URL collector  Author: Char0n'''

header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'}

def google(keyword,pagenumber):
    time_stamp = str(time())
    try:
        for page in range(0,(pagenumber-1)*10+1,10):
            google_url = 'https://www.google.com/search?q={}&start={}'.format(keyword,page)
            html = requests.get(google_url,headers=header)
            patt = re.compile(r'"><a href="/url\?q=(.*?)&amp;sa=')
            url_list = re.findall(patt,html.text)
            ext(url_list,time_stamp)
            sleep(5)
    except requests.exceptions.ProxyError:
        print('Request blocked, a CAPTCHA check may be required')

def baidu(keyword,pagenumber):
    time_stamp = str(time())
    try:
        for page in range(0,(pagenumber-1)*10+1,10):
            baidu_url = 'https://www.baidu.com/s?wd={}&pn={}'.format(keyword,page)
            html = requests.get(baidu_url,headers=header)
            patt = re.compile('"><a target="_blank" href="(.*?)" class="')
            url_list = re.findall(patt,html.text)
            baidu_ext(url_list,time_stamp)
            sleep(5)
    except requests.exceptions.ProxyError:
        print('Request blocked, a CAPTCHA check may be required')

def bing(keyword,pagenumber):
    time_stamp = str(time())
    try:
        for page in range(1,pagenumber*10+1,10):
            bing_url = 'https://cn.bing.com/search?q={}&go=%e6%90%9c%e7%b4%a2&qs=ds&mkt=zh-CN&first={}&FORM=PERE'.format(keyword,page)
            html = requests.get(bing_url,headers=header)
            patt = re.compile('<a target="_blank" href="(.*?)" h=')
            url_list = re.findall(patt,html.text)
            ext(url_list,time_stamp)
            sleep(5)
    except requests.exceptions.ProxyError:
        print('Request blocked, a CAPTCHA check may be required')

def ext(url_list,time_stamp):
    # Decode percent-encoded characters and append each URL to the result file.
    for url in url_list:
        url = parse.unquote(url)
        with open('{}.txt'.format(time_stamp),'a') as f:
            f.write(url+'\n')
            print('[+]'+url)

# Baidu result links are redirects, so a second request is needed to recover the original URL
def baidu_ext(url_list,time_stamp):
    for url in url_list:
        try:
            # Follow Baidu's redirect; the real target ends up in response.url.
            again = requests.get(url,headers=header,timeout=5.0)
        except requests.exceptions.RequestException:
            # Skip links that fail or time out instead of aborting the whole batch.
            print('Request failed or timed out, skipping: '+url)
            continue
        with open('{}.txt'.format(time_stamp),'a') as f:
            f.write(again.url+'\n')
            print('[+]'+again.url)
        sleep(1)

def main(Search_engine,Search_key,Search_page):
    if Search_engine == 1:
        google(Search_key,Search_page)
    elif Search_engine == 2:
        bing(Search_key,Search_page)
    elif Search_engine == 3:
        baidu(Search_key,Search_page)
    else:
        print('Please enter a valid engine number')

if __name__ == '__main__':
    print('''Instructions: URL collector. A request timeout may mean the search engine has triggered a CAPTCHA check.''')
    Search_engine = input('Enter the search engine number [1 : Google][2 : Bing][3 : BaiDu]: ')
    Search_key = input('Enter the keyword: ')
    Search_page = input('Enter the number of pages to collect (10 results per page): ')
    main(int(Search_engine),Search_key,int(Search_page))
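To see what the extraction step does on its own, the Bing pattern can be run against a hand-written snippet (the HTML below is a made-up example, not a captured Bing response):

import re

sample = '<li class="b_algo"><h2><a target="_blank" href="https://example.com/" h="ID=SERP,5077.1">Example</a></h2></li>'
patt = re.compile('<a target="_blank" href="(.*?)" h=')
print(re.findall(patt, sample))  # ['https://example.com/']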

 
