pyquery in Python

Without further ado, here's the code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020-03-17 21:16
# @Author  : 蓝狼
# @File    : star_data_scrapy.py
# @Desc    : scrape celebrity profile data
import time

import pymysql
import requests
import urllib3
from pyquery import PyQuery as pq


urllib3.disable_warnings()

mysql_config = {
    'host': 'x x x',
    'port': 3306,
    'user': 'xxx',
    'passwd': 'xxx',
    'db': 'web2717',
    'charset': 'utf8mb4'
}
conn = pymysql.connect(**mysql_config)

cursor = conn.cursor()

# Timing decorator: print how long the wrapped function takes
def time_usage(func):
    def clocked(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print(func.__name__, end-start)
        return result
    return clocked


def build_url(*args):
    """
    构建访问地址
    :param args:
    :return:
    """
    url = 'https://www.2717.com/star/'
    path = '_'.join(args)
    return url + path


def save_data_to_mysql(info):
    """Insert one star record; return the new row id, or False on failure."""
    sql = '''
        insert into `star_basic`(name, sex, portrait, location, detail_url)
        values (%s, %s, %s, %s, %s)
    '''
    params = (info['name'], info['sex'], info['portrait'], info['location'], info['url'])
    print(sql, params)
    try:
        # parameterized query: let the driver handle quoting/escaping of the values
        cursor.execute(sql, params)
        last_id = cursor.lastrowid
        conn.commit()
    except Exception as e:
        print(f'insert star_basic error. {e}')
        return False

    return last_id


def fetch_star_list(url, country, sex):
    """
    抓取明星列表
    :return:
    """
    res = requests.get(url, verify=False)
    res.encoding = 'GBK'  # the page is GBK-encoded, so set it explicitly before parsing
    # res.encoding = res.apparent_encoding
    doc = pq(res.text)
    lists = doc('#a_selectbox2 li')
    print(lists)
    if not lists:  # an empty selection is falsy; pq() never returns None
        return False

    ls = []
    for item in lists.items():
        info = {
            'name': item.find('a').attr('title'),
            'sex': 1 if sex == 'nan' else 0,  # 'nan' (男) = male -> 1, 'nv' (女) = female -> 0
            'url': item.find('a').attr('href'),
            'portrait': item.find('img').attr('src'),
            'location': country
        }
        ls.append(info)

    return ls


@time_usage
def main():

    countries = {
        '港澳': 'gangtai',
        '日韩': 'rihan',
        '欧美': 'oumei',
        '大陆': 'dalu',
        '其他': 'qita'
    }

    sexes = ('nan', 'nv')

    letters = [chr(letter).lower() for letter in range(65, 91)]  # 'a' .. 'z'

    for name, country in countries.items():
        print('###'*10, country, '###'*10)

        for sex in sexes:
            print('###'*5, sex, '###'*5)

            for letter in letters:
                print('###'*3, letter, '###'*3)
                url = build_url(country, sex, letter)
                print(f'url:{url}')
                lists = fetch_star_list(url, name, sex)
                if not lists:
                    print('empty data.')
                    continue

                print(lists)
                for ls in lists:
                    ret = save_data_to_mysql(ls)
                    if ret:
                        print(f'save data success. id:{ret}')
                    else:
                        print('save data failure')



if __name__ == '__main__':
    print('crawling star data starting...')
    main()
    cursor.close()
    conn.close()
    print('crawling ended')
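
Since the post is about pyquery, here is the core pattern that fetch_star_list relies on, shown in isolation against a small made-up HTML snippet (not the real page markup): pq() parses the document, a CSS selector picks the li nodes, .items() iterates over them as PyQuery objects, and .find() / .attr() pull out the attributes.

from pyquery import PyQuery as pq

html = '''
<ul id="a_selectbox2">
  <li><a href="/star/1.html" title="Star A"><img src="/img/a.jpg"></a></li>
  <li><a href="/star/2.html" title="Star B"><img src="/img/b.jpg"></a></li>
</ul>
'''

doc = pq(html)                                 # parse the HTML string
for item in doc('#a_selectbox2 li').items():   # iterate matched <li> as PyQuery objects
    link = item.find('a')
    print(link.attr('title'), link.attr('href'), item.find('img').attr('src'))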

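One note on running the script: it expects a `star_basic` table to already exist in the `web2717` database. The post doesn't show the schema, so the sketch below is only a guess based on the columns used in the INSERT (column types and lengths are assumptions); it could be run once with the same conn / cursor created above.

# guessed schema for `star_basic`; only the column names come from the INSERT above
create_sql = '''
    create table if not exists `star_basic` (
        `id`         int unsigned not null auto_increment primary key,
        `name`       varchar(64)  not null,
        `sex`        tinyint      not null default 0,     -- 1 = male, 0 = female
        `portrait`   varchar(255) default null,           -- thumbnail image URL
        `location`   varchar(32)  default null,           -- region label, e.g. 大陆 / 日韩
        `detail_url` varchar(255) default null            -- link to the star's detail page
    ) engine=InnoDB default charset=utf8mb4
'''
cursor.execute(create_sql)
conn.commit()
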
Final scraping result:

Next time, I'll follow the detail links collected here to scrape more information about each star.
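
A minimal sketch of that follow-up step, reusing the detail_url values saved above. The selectors here are placeholders, since the detail page's markup isn't shown in this post, and the function assumes the same requests / pq imports as the script.

from urllib.parse import urljoin

def fetch_star_detail(detail_url):
    """Hypothetical follow-up: fetch one star's detail page.
    All selectors below are placeholders, not the real page structure."""
    # hrefs scraped from the list page may be relative, so join them with the site root
    full_url = urljoin('https://www.2717.com/', detail_url)
    res = requests.get(full_url, verify=False)
    res.encoding = 'GBK'
    doc = pq(res.text)
    return {
        'title': doc('h1').text(),                                  # placeholder selector
        'images': [img.attr('src') for img in doc('img').items()],  # every image URL on the page
    }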