Scraping GitHub's Weekly Trending with a Python Web Crawler

A while back the lab asked me to post GitHub's trending repositories every week, so I wrote a simple crawler for it. It is entry-level and fairly straightforward.

import json
import time
import urllib.parse
import urllib.request

import bs4
import requests
def get_html(url):
    '''
    Wrap the GET request with browser-like headers.
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
        'ContentType': 'text/html; charset=utf-8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException:
        return "request failed"

"""爬取,正则"""
def get_content(url):

    repo_names = []
    repo_descriptions = []
    repo_urls = []
    html = get_html(url)
    soup = bs4.BeautifulSoup(html, 'lxml')

    # NOTE: the class names below match GitHub's trending-page markup at the
    # time of writing; if GitHub changes its HTML they will need updating.
    """Scrape the repository names and build the repository URLs."""
    content_ul1 = soup.findAll('div', class_="d-inline-block col-9 mb-1")
    for content in content_ul1:
        try:
            raw = content.find("a").text
            repo_names.append(raw.strip().strip('\''))
            repo_urls.append("https://github.com/" + blank(raw))
        except Exception:
            print('name not found')

    """Scrape the descriptions and translate them."""
    content_ul2 = soup.findAll('div', class_="py-1")
    for content in content_ul2:
        try:
            txt = content.find('p').text.strip()
            translated = translateYoudao(txt)
            print(txt)
            print(translated)
            repo_descriptions.append(translated)
        except Exception:
            print('description not found')

    # Write everything to a single UTF-8 file with the date in its name.
    filename = "github_weekly_trending_" + time.strftime("%Y-%m-%d") + ".txt"
    with open(filename, "w", encoding="utf-8") as fp:
        for name, description, link in zip(repo_names, repo_descriptions, repo_urls):
            fp.write(name + "\n")
            fp.write(description + "\n")
            fp.write(link + "\n")

"""去空格"""
def blank(a):
    list_a=[]
    c=''
    list_a=a.split("/")
    for i in range(len(list_a)):
        list_a[i]=list_a[i].strip()
    for i in range(len(list_a)):
        c=c+list_a[i]+"/"
    return c
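
# For illustration (the exact whitespace in the anchor text is an assumption
# about GitHub's markup):
#   blank("owner /\n\n        repo")  ->  "owner/repo"
# get_content() then prefixes this with "https://github.com/" to form the repo URL.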
url_youdao = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=' \
                 'http://www.youdao.com/'
# Form fields for the Youdao web translator. The salt/sign values are fixed
# ones copied from a browser request and may stop validating at some point.
youdao_params = {
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '1500349255670',
    'sign': '997742c66698b25b43a3a5030e1c2ff2',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CL1CKBUTTON',
    'typoResult': 'true',
}
"""调用有道进行翻译"""
def translateYoudao(text):
    youdao_params['i'] = text
    data = urllib.parse.urlencode(youdao_params).encode('utf-8')
    response = urllib.request.urlopen(url_youdao, data)
    content = response.read().decode('utf-8')
    data = json.loads(content)
    result = data['translateResult'][0][0]['tgt']
    return result

if __name__ == '__main__':
    url = 'https://github.com/trending?since=weekly'
    get_content(url)

This scrapes GitHub's weekly trending and translates the scraped descriptions at the same time, but the current problem is that the translated Chinese comes out garbled.

If you have ideas or a fix, feel free to reach out: QQ 2418916003.
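
There are two usual suspects for this kind of mojibake, neither of which I have verified against this exact script: the output file being written with the platform's default codec instead of UTF-8 (the get_content above already passes encoding='utf-8' for that reason; on Windows the default is often GBK), and the Youdao endpoint rejecting the fixed salt/sign and returning an error payload rather than a translation. Below is a minimal sketch that makes both checks explicit; translate_and_save and the output filename are made-up names for illustration, while url_youdao and youdao_params are the globals from the script above.

import json
import urllib.parse
import urllib.request

def translate_and_save(text, filename="translated_demo.txt"):
    """Hypothetical helper for illustration; assumes it lives in the same module
    as the script above so url_youdao and youdao_params are in scope."""
    data = urllib.parse.urlencode(dict(youdao_params, i=text)).encode('utf-8')
    with urllib.request.urlopen(url_youdao, data) as response:
        payload = json.loads(response.read().decode('utf-8'))
    # 1) Make sure the API actually returned a translation before indexing into it;
    #    a rejected request (e.g. a stale salt/sign) comes back without 'translateResult'.
    if 'translateResult' not in payload:
        raise RuntimeError('Youdao rejected the request: %r' % payload)
    translated = payload['translateResult'][0][0]['tgt']
    # 2) Write with an explicit UTF-8 encoding instead of the platform default.
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(translated + '\n')
    return translated

If the file then looks correct in a UTF-8-aware editor, the write path was the problem rather than the translation itself.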
