python 爬虫输出为空_爬虫输出文件为空,求解答

本文探讨了一个Python爬虫程序在抓取数据并写入csv文件时出现输出为空的问题。通过检查代码,发现可能存在的问题包括迭代设置错误和函数错误。使用了requests库进行网页请求,lxml库解析HTML,以及多线程处理数据。问题可能出现在数据提取、文件写入或并发处理部分,建议检查xpath表达式、文件打开模式以及多线程同步。
摘要由CSDN通过智能技术生成

import csv
import json
import random
import threading
import time
from multiprocessing.dummy import Pool as ThreadPool

import requests
from lxml import etree

# Output file shared by every worker thread.
_CSV_PATH = 'mtjd3.csv'
_CSV_FIELDS = ['name', 'time', 'order']
# spider() is run from a thread pool; serialise the appends so the
# "is the file still empty?" check and the write happen atomically.
_CSV_LOCK = threading.Lock()

_SEARCH_URL = 'http://waimai.meituan.com/search/wx4g19983su8/rt'
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # NOTE(review): hard-coded session cookie copied from a browser; it has
    # presumably expired, which would make the site return a page without
    # any result items -- confirm against a live response.
    'Cookie': 'w_uuid=Lk80hpGbK2WdGpo3knW8qZw2M6dDGUODsDrcGIPj8TdyhDChRFTV-fjxkGWwEQ9w; _lxsdk=15d9af78c28c8-0ee719fb4f1d34-57e1b3c-144000-15d9af78c29c8; Hm_lvt_f66b37722f586a240d4621318a5a6ebe=1501555925,1501555942,1501629715; __utma=211559370.58204263.1500387690.1501555925.1501629715.2; __utmz=211559370.1501629715.2.2.utmcsr=baidu|utmccn=baidu|utmcmd=organic|utmcct=zt_search; __mta=45569435.1501629712300.1501629720518.1501629728116.3; uuid=9dd0ef628497b76a1925.1501543494.0.0.0; _lxsdk_cuid=15e4aa6effc9e-0b23fb07f9c1bf-57e1b3c-144000-15e4aa6effdc8; _ga=GA1.2.58204263.1500387690; _gid=GA1.2.1025663946.1504513189; w_cid=110101; w_cpy_cn="%E4%B8%9C%E5%9F%8E%E5%8C%BA"; w_cpy=dongchengqu; waddrname="%E6%9C%AA%E7%9F%A5"; w_geoid=wx4g19983su8; w_ah="39.91065189242363,116.43523581326008,%E6%9C%AA%E7%9F%A5|39.8989349976182,116.50381989777088,%E7%99%BE%E5%AD%90%E6%B9%BE|39.96550491079688,116.30504373461008,%E6%B5%B7%E6%B7%80%E5%8C%BA|40.004531890153885,116.47530399262905,%E6%9C%9B%E4%BA%AC%2C%E6%9D%A5%E5%B9%BF%E8%90%A5%2C%E8%8A%B1%E5%AE%B6%E5%9C%B0"; JSESSIONID=s1gvzkp9v2th1ed70m8qr5yyz; _ga=GA1.3.58204263.1500387690; _gid=GA1.3.1025663946.1504513189; _gat=1; w_utmz="utm_campaign=(direct)&utm_source=(direct)&utm_medium=(none)&utm_content=(none)&utm_term=(none)"; w_visitid=039dc5a0-4720-4cef-afc1-e43eeec3671a; __mta=45569435.1501629712300.1501629728116.1504518924133.4',
    'Host': 'waimai.meituan.com',
    'Upgrade-Insecure-Requests': '1',
}


def _append_rows(rows, path=_CSV_PATH):
    """Append *rows* (dicts keyed by _CSV_FIELDS) to the CSV at *path*.

    The header row is written only when the file is still empty, so the
    output contains a single header no matter how many times (or from how
    many threads) this is called.
    """
    if not rows:
        return
    with _CSV_LOCK:
        with open(path, 'a', newline='', errors='ignore') as f:
            writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
            # Move to the end explicitly so tell() reports the file size.
            f.seek(0, 2)
            if f.tell() == 0:
                writer.writeheader()
            writer.writerows(rows)


def spider(page):
    """Search the site for the keyword *page* and append the hits to the CSV.

    One row with the columns ``name``, ``time`` and ``order`` is appended to
    ``mtjd3.csv`` for every ``<li>`` found under the ``result-content``
    element of the response. If nothing matches, a diagnostic line is
    printed and the file is left untouched.

    Args:
        page: The search keyword; it is converted with ``str()``.

    Returns:
        None.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example on timeout or connection failure).
    """
    keyword = str(page)
    timeout = random.randint(10, 19)
    # Pass the keyword only through ``params``: the previous code also baked
    # it into the URL, so the query string carried ``keyword`` twice.
    response = requests.get(
        _SEARCH_URL,
        headers=_HEADERS,
        timeout=timeout,
        params={'keyword': keyword},
    )

    selector = etree.HTML(response.text)
    result_nodes = selector.xpath('//*[@class="result-content"]/ul/li')

    # Build a fresh dict per result; the local is named ``time_text`` so it
    # does not shadow the imported ``time`` module.
    rows = []
    for node in result_nodes:
        name_text = node.xpath('a/div[1]/p[1]/text()')
        time_text = node.xpath('a/div[1]/p[4]/text()')
        order_text = node.xpath('a/div[1]/p[3]/span[2]/text()')
        rows.append({
            'name': "".join(name_text),
            'time': "".join(time_text),
            'order': "".join(order_text),
        })

    if not rows:
        # Make the "output file is empty" case visible instead of silent.
        print('spider: no result items matched for keyword %r (HTTP %s)'
              % (keyword, response.status_code))
        return

    _append_rows(rows)

if __name__ == '__main__':
    # Start from an empty output file. The handle is closed immediately so
    # it is not held open while the worker threads append to the same file.
    with open('mtjd3.csv', 'w'):
        pass

    # Search keywords, one spider() call per keyword. Named ``keywords``
    # rather than ``list`` so the builtin is not shadowed.
    keywords = ["大虾来了", "夹克的虾", "簋街仔仔", "辣私房", "烧虾师"]
    print(keywords)

    pool = ThreadPool(4)
    try:
        # map() blocks until every keyword has been processed.
        results = pool.map(spider, keywords)
    finally:
        # Always release the worker threads, even if a request raised.
        pool.close()
        pool.join()

运行后输出的 csv 文件为空,不知道是迭代设置有误,还是函数本身有错误?求帮助。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值