Scraping GitHub's Weekly Trending with a Python Web Crawler

A while back the lab asked me to post GitHub's trending repositories every week, so I wrote a simple crawler for it. It is entry-level and fairly straightforward.

import json
import time
import urllib.parse
import urllib.request

import bs4
import requests
def get_html(url):
    '''
    Wrap the GET request with browser-like headers.
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
        'ContentType': 'text/html; charset=utf-8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException:
        return "request failed"

"""爬取,正则"""
def get_content(url):

    repo_names = []
    repo_descriptions = []
    repo_urls = []
    html = get_html(url)
    soup = bs4.BeautifulSoup(html, 'lxml')

    # NOTE: the class names below match GitHub's trending-page markup at the
    # time of writing; if GitHub changes its HTML they will need updating.
    """Scrape the repository names and build the repository URLs."""
    content_ul1 = soup.findAll('div', class_="d-inline-block col-9 mb-1")
    for content in content_ul1:
        try:
            raw = content.find("a").text
            repo_names.append(raw.strip().strip('\''))
            repo_urls.append("https://github.com/" + blank(raw))
        except Exception:
            print('name not found')

    """Scrape the descriptions and translate them."""
    content_ul2 = soup.findAll('div', class_="py-1")
    for content in content_ul2:
        try:
            txt = content.find('p').text.strip()
            translated = translateYoudao(txt)
            print(txt)
            print(translated)
            repo_descriptions.append(translated)
        except Exception:
            print('description not found')

    # Write everything to a single UTF-8 file with the date in its name.
    filename = "github_weekly_trending_" + time.strftime("%Y-%m-%d") + ".txt"
    with open(filename, "w", encoding="utf-8") as fp:
        for name, description, link in zip(repo_names, repo_descriptions, repo_urls):
            fp.write(name + "\n")
            fp.write(description + "\n")
            fp.write(link + "\n")

"""去空格"""
def blank(a):
    list_a=[]
    c=''
    list_a=a.split("/")
    for i in range(len(list_a)):
        list_a[i]=list_a[i].strip()
    for i in range(len(list_a)):
        c=c+list_a[i]+"/"
    return c
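
# For illustration (the exact whitespace in the anchor text is an assumption
# about GitHub's markup):
#   blank("owner /\n\n        repo")  ->  "owner/repo"
# get_content() then prefixes this with "https://github.com/" to form the repo URL.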
url_youdao = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=' \
                 'http://www.youdao.com/'
# Form fields for the Youdao web translator. The salt/sign values are fixed
# ones copied from a browser request and may stop validating at some point.
youdao_params = {
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '1500349255670',
    'sign': '997742c66698b25b43a3a5030e1c2ff2',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CL1CKBUTTON',
    'typoResult': 'true',
}
"""调用有道进行翻译"""
def translateYoudao(text):
    youdao_params['i'] = text
    data = urllib.parse.urlencode(youdao_params).encode('utf-8')
    response = urllib.request.urlopen(url_youdao, data)
    content = response.read().decode('utf-8')
    data = json.loads(content)
    result = data['translateResult'][0][0]['tgt']
    return result

if __name__ == '__main__':
    url = 'https://github.com/trending?since=weekly'
    get_content(url)

This scrapes GitHub's weekly trending and translates the scraped descriptions at the same time, but the current problem is that the translated Chinese comes out garbled.

If you have ideas or a fix, feel free to reach out: QQ 2418916003.
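
There are two usual suspects for this kind of mojibake, neither of which I have verified against this exact script: the output file being written with the platform's default codec instead of UTF-8 (the get_content above already passes encoding='utf-8' for that reason; on Windows the default is often GBK), and the Youdao endpoint rejecting the fixed salt/sign and returning an error payload rather than a translation. Below is a minimal sketch that makes both checks explicit; translate_and_save and the output filename are made-up names for illustration, while url_youdao and youdao_params are the globals from the script above.

import json
import urllib.parse
import urllib.request

def translate_and_save(text, filename="translated_demo.txt"):
    """Hypothetical helper for illustration; assumes it lives in the same module
    as the script above so url_youdao and youdao_params are in scope."""
    data = urllib.parse.urlencode(dict(youdao_params, i=text)).encode('utf-8')
    with urllib.request.urlopen(url_youdao, data) as response:
        payload = json.loads(response.read().decode('utf-8'))
    # 1) Make sure the API actually returned a translation before indexing into it;
    #    a rejected request (e.g. a stale salt/sign) comes back without 'translateResult'.
    if 'translateResult' not in payload:
        raise RuntimeError('Youdao rejected the request: %r' % payload)
    translated = payload['translateResult'][0][0]['tgt']
    # 2) Write with an explicit UTF-8 encoding instead of the platform default.
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(translated + '\n')
    return translated

If the file then looks correct in a UTF-8-aware editor, the write path was the problem rather than the translation itself.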
