爬取3499手游网下载地址信息
爬取游戏的下载地址和信息,爬取的信息存入到数据库中。
1、首先需要安装第三方库
requests、lxml、MySQLdb(在 Python 3 下 MySQLdb 模块由 mysqlclient 包提供,可执行 pip install requests lxml mysqlclient 安装)
2、先创建down_software数据库,创建youxi表
create table down_software.youxi
(
id int auto_increment
primary key,
yx_title varchar(255) null,
yx_bsc varchar(255) null,
yx_os varchar(255) null,
yx_type varchar(255) null,
comment_str text null,
yx_jietu text null,
yx_down_str varchar(255) null,
yx_head_portrait_url varchar(255) null,
url varchar(255) null
)
engine = MyISAM;
3、要爬取的网站https://www.34347.com,直接贴代码。
import requests
from lxml import etree
import MySQLdb
import time
def mySql(yx_title, yx_bsc, yx_os, yx_type, comment_str, yx_jietu, yx_down_str, yx_head_portrait_url, url):
    """Insert one scraped game record into the ``youxi`` table.

    Each argument is written to the column of the same name
    (``yx_jietu`` -> ``yx_jietu``, ``url`` -> ``url``, ...).

    The values are handed to the driver as query parameters instead of
    being concatenated into the SQL text, so quotes or other special
    characters in scraped text can neither break nor inject into the
    statement. This also drops the stray leading space the old literal
    prepended to every column from ``yx_type`` onwards.

    The insert is best-effort: on a database error the transaction is
    rolled back, the error is printed, and the function returns normally.
    The connection is always closed.
    """
    # NOTE(review): check_url() connects with a different password
    # ("qinchaowei"); confirm which credential is the correct one.
    db = MySQLdb.connect("localhost", "root", "root", "down_software", charset='utf8')
    try:
        cursor = db.cursor()
        sql = (
            "INSERT INTO youxi(yx_title, yx_bsc, yx_os, yx_type, comment_str, "
            "yx_jietu, yx_down_str, yx_head_portrait_url, url) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        )
        params = (
            yx_title,
            yx_bsc,
            yx_os,
            yx_type,
            comment_str,
            yx_jietu,
            yx_down_str,
            yx_head_portrait_url,
            url,
        )
        try:
            cursor.execute(sql, params)
            db.commit()
        except MySQLdb.Error as err:
            # Keep the crawl going, but make the failure visible.
            db.rollback()
            print('insert into youxi failed: %s' % err)
    finally:
        db.close()
def down_info(url_i):
    """Scrape every game detail page in *url_i* and store it with mySql().

    *url_i* is an iterable of site-relative hrefs; each is appended to
    ``https://www.34347.com`` and fetched with the module-level ``data``
    and ``headers``.

    For each page the title, icon URL, download links, info fields,
    platforms, genre, description paragraphs and screenshot URLs are read
    via XPath. Multi-valued fields are joined with ``,`` (description
    paragraphs with a newline) before being passed to mySql(). The
    relative href is stored as the record's ``url``.

    Processing of *url_i* stops at the first page that has no title or
    whose title check_url() already finds in the database.
    """
    # Info panel shared by most of the fields below.
    panel = '/html/body/div[1]/div[4]/div[3]/div[1]/div/div[1]'
    for href in url_i:
        url = 'https://www.34347.com' + href
        response = requests.get(url, data=data, headers=headers)
        print(url)
        html_info = etree.HTML(response.text)

        titles = html_info.xpath(panel + '/dl/dt/h3/text()')  # game name
        if not titles:
            # NOTE(review): this abandons the rest of the list, not just
            # this page; confirm `continue` is not what was intended.
            break
        yx_title = titles[0]
        if len(check_url(yx_title)) > 0:
            # Already stored; the remaining hrefs are skipped as well.
            break

        # Single-valued fields: fall back to '' when the node is missing
        # instead of raising IndexError.
        portraits = html_info.xpath(panel + '/div[1]/img/@src')  # icon URL
        yx_head_portrait_url = portraits[0] if portraits else ''
        types = html_info.xpath(panel + '/dl/dd[2]/p[2]/span[2]/em/text()')  # genre
        yx_type = types[0] if types else ''

        # Multi-valued fields. The `[.]` predicate is always true, so each
        # of these selects every matching sibling.
        yx_down = html_info.xpath(panel + '/dl/dd[3]/a[.]/@href')  # download links
        yx_bsc = html_info.xpath(
            panel + '/dl/dd[2]/p[1]/span[.]/em/text()')  # version, size, vendor, downloads
        yx_os = html_info.xpath(panel + '/dl/dd[2]/p[2]/span[1]/em[.]/text()')  # platforms
        yx_comment = html_info.xpath(
            '/html/body/div[1]/div[4]/div[3]/div[1]/div/div[2]/div[2]/div[1]/div[1]/p[.]/text()')  # description
        yx_jietu = html_info.xpath('//*[@id="x_img_viewer"]/div/ul/li[.]/img/@src')  # screenshots

        yx_bsc_str = ','.join(yx_bsc)
        yx_os_str = ','.join(yx_os)
        comment_str = '\n'.join(yx_comment)
        yx_jietu_str = ','.join(yx_jietu)
        yx_down_str = ','.join(yx_down)

        # Store the record. `href` is the page's relative URL; the old code
        # reused the name `i` for a description paragraph and stored that.
        mySql(yx_title, yx_bsc_str, yx_os_str, yx_type, comment_str, yx_jietu_str, yx_down_str,
              yx_head_portrait_url, href)
def check_url(result_title):
    """Look up *result_title* in the ``youxi`` table.

    Returns the result of ``cursor.fetchall()`` for the rows whose
    ``yx_title`` equals *result_title*; the result is empty when the
    title has not been stored yet.

    The title is passed as a query parameter rather than concatenated
    into the SQL text, so titles containing quotes no longer break the
    statement. The connection is closed even if the query raises.
    """
    # NOTE(review): mySql() connects with password "root" while this
    # function uses "qinchaowei"; confirm which credential is correct.
    db = MySQLdb.connect("localhost", "root", "qinchaowei", "down_software", charset='utf8')
    try:
        cursor = db.cursor()
        cursor.execute("SELECT yx_title FROM youxi WHERE yx_title = %s", (result_title,))
        return cursor.fetchall()
    finally:
        db.close()
def pageinfo(num):
    """Crawl list pages ``0`` through *num* (inclusive).

    *num* is converted with ``int()``, so a numeric string is accepted.
    Each list page is fetched with the module-level ``data`` and
    ``headers``, and the game links found on it are handed to
    down_info().
    """
    for page in range(int(num) + 1):
        page_url = "https://www.34347.com/game/list-" + str(page) + "-0-0-0-0-0-0-0.html"
        response = requests.get(page_url, data=data, headers=headers)
        tree = etree.HTML(response.text)
        # The `[.]` predicate is always true, so this collects the link of
        # every <li> in the list.
        result_url = tree.xpath('/html/body/div[1]/div[4]/div[2]/div[2]/ul/li[.]/h3/a/@href')
        down_info(result_url)
# Request body and headers shared by every HTTP call in this script.
data = {'some': 'data'}
headers = {
    'content-type': 'application/json',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
}

# Fetch the first list page to read the pagination bar.
s = requests.get(
    'https://www.34347.com/game/list-0-0-0-0-0-0-0-0.html',
    data=data,
    headers=headers,
)
html = etree.HTML(s.text)
result_title = html.xpath('/html/body/div[1]/div[4]/div[2]/div[2]/ul/li[.]/h3/a/text()')  # game titles
result_url = html.xpath('/html/body/div[1]/div[4]/div[2]/div[2]/ul/li[.]/h3/a/@href')  # game links
yx_page = html.xpath('/html/body/div[1]/div[4]/div[2]/div[3]/nav/ul/li[.]/a/text()')  # pagination labels

# Discard the final pagination label (presumably a "next"/"last" link;
# confirm against the live page), then use the new last label as the
# highest page number to crawl.
yx_page.pop()
num = yx_page[-1]
pageinfo(num)
运行以上代码后,爬取到的数据会存入数据库的 youxi 表中。
此代码仅供学习。