话不多说,直接上代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020-03-17 21:16
# @Author : 蓝狼
# @File : star_data_scrapy.py
# @Desc : 明星资料抓取
import functools
import time

import pymysql
import requests
import urllib3
from pyquery import PyQuery as pq
# Silence urllib3 warnings process-wide.
urllib3.disable_warnings()

# MySQL connection settings. NOTE(review): host, user and passwd look like
# placeholders - replace them with real values before running.
mysql_config = dict(
    host='x x x',
    port=3306,
    user='xxx',
    passwd='xxx',
    db='web2717',
    charset='utf8mb4',
)

# Module-level connection and cursor, opened when the module is loaded.
conn = pymysql.connect(**mysql_config)
cursor = conn.cursor()
# Timing decorator
def time_usage(func):
    """
    Wrap ``func`` so that every call prints its name and elapsed seconds.

    Nothing is printed when ``func`` raises; the exception propagates.

    :param func: callable to time
    :return: wrapper that forwards all arguments to ``func`` and returns
        its result unchanged
    """
    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def clocked(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(func.__name__, elapsed)
        return result
    return clocked
def build_url(*args, base_url='https://www.2717.com/star/'):
    """
    Build a listing-page URL from path segments.

    The segments are joined with ``_`` and appended to ``base_url``; for
    example ``build_url('dalu', 'nan', 'a')`` returns
    ``https://www.2717.com/star/dalu_nan_a``.

    :param args: path segments (strings), joined in the order given
    :param base_url: prefix the joined segments are appended to
        (keyword-only; defaults to the star listing root)
    :return: the complete URL as a string
    """
    return base_url + '_'.join(args)
def save_data_to_mysql(lists):
    """
    Insert one star record into the ``star_basic`` table.

    :param lists: dict with the keys ``name``, ``sex``, ``portrait``,
        ``location`` and ``url`` (``url`` is stored in ``detail_url``)
    :return: ``cursor.lastrowid`` after a successful insert, ``False`` when
        the insert or commit raised
    :raises KeyError: if ``lists`` lacks one of the required keys
    """
    # Placeholders let the driver escape every value, so scraped text that
    # contains quotes can neither break the statement nor inject SQL.
    sql = (
        'insert into `star_basic`(name,sex,portrait,location,detail_url) '
        'values(%s,%s,%s,%s,%s)'
    )
    # A None value is sent as SQL NULL instead of the literal text 'None'.
    params = (
        lists['name'],
        lists['sex'],
        lists['portrait'],
        lists['location'],
        lists['url'],
    )
    print(sql, params)
    try:
        cursor.execute(sql, params)
        last_id = cursor.lastrowid
        conn.commit()
    except Exception as e:
        # Best-effort insert: report the failure and let the caller go on.
        print(f'insert star_basic error.{e}')
        return False
    return last_id
def fetch_star_list(url, country, sex, timeout=10):
    """
    Fetch one star listing page and extract the basic info of each entry.

    :param url: address of the listing page to request
    :param country: value stored under ``location`` for every entry
    :param sex: ``'nan'`` is mapped to 1, any other value to 0
    :param timeout: seconds to wait for the server before ``requests``
        raises instead of blocking indefinitely
    :return: list of dicts with the keys ``name``, ``sex``, ``url``,
        ``portrait`` and ``location``; empty when nothing matched
    """
    # NOTE(review): verify=False turns off TLS certificate verification;
    # confirm it is still required for the target site.
    res = requests.get(url, verify=False, timeout=timeout)
    # Decode as GBK explicitly rather than relying on encoding detection.
    res.encoding = 'GBK'
    doc = pq(res.text)
    lists = doc('#a_selectbox2 li')
    print(lists)

    # A selection without matches is an empty PyQuery object, never None,
    # so no separate "missing" check is needed: the loop simply yields
    # nothing and an empty list is returned.
    sex_flag = 1 if sex == 'nan' else 0
    ls = []
    for item in lists.items():
        link = item.find('a')
        ls.append({
            'name': link.attr('title'),
            'sex': sex_flag,
            'url': link.attr('href'),
            'portrait': item.find('img').attr('src'),
            'location': country,
        })
    return ls
@time_usage
def main():
    """
    Crawl every region / sex / initial-letter listing page and store each
    extracted entry through ``save_data_to_mysql``.
    """
    # Display name (passed on as the entry's location) -> slug used in the URL.
    regions = {
        '港澳': 'gangtai',
        '日韩': 'rihan',
        '欧美': 'oumei',
        '大陆': 'dalu',
        '其他': 'qita'
    }
    initials = [chr(code) for code in range(ord('a'), ord('z') + 1)]

    def banner(width, label):
        # Progress marker: the label framed by `width` repetitions of '###'.
        print('###' * width, label, '###' * width)

    for region_name, region_slug in regions.items():
        banner(10, region_slug)
        for sex in ('nan', 'nv'):
            banner(5, sex)
            for initial in initials:
                banner(3, initial)
                page_url = build_url(region_slug, sex, initial)
                print(f'url:{page_url}')
                stars = fetch_star_list(page_url, region_name, sex)
                if not stars:
                    print('empty data.')
                    continue
                print(stars)
                for star in stars:
                    row_id = save_data_to_mysql(star)
                    if not row_id:
                        print('save data failure')
                        continue
                    print(f'save data success. id:{row_id}')
# Script entry point: run the crawl, then release the database handles.
if __name__ == '__main__':
    print('crawling start data staring...')
    try:
        main()
    finally:
        # Close the module-level cursor and connection even when the crawl
        # aborts with an exception, so the MySQL session is not left open.
        cursor.close()
        conn.close()
    print('crawling ended')
最终抓取效果:
下次,通过这里面的详情链接,抓取更多信息。