Traditional, coroutine-based, and multithreaded crawlers

All three scripts below crawl the listing pages of wap.xaecong.com/zhuzuo.asp (pages 1-999), extract (title, content) pairs with lxml XPath, append them to a text file, and print the elapsed time, so the three concurrency styles can be compared directly.

# Coroutine crawler --- 奕聪软件
import aiohttp
import asyncio
import async_timeout
import time
from lxml import etree

# Cap the crawler at 10 concurrent requests so the server is not flooded
sem = asyncio.Semaphore(10)


async def fetch(url, page, session):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.99 Safari/537.36",
    }
    async with sem:
        try:
            # async_timeout.timeout is an async context manager, and the request
            # must sit inside both it and the try block for the timeout to be caught
            async with async_timeout.timeout(10):
                async with session.get(url=url, headers=headers) as response:
                    return await response.text()
        except asyncio.TimeoutError:
            # Record the timed-out page number for a later retry
            # (plain text, one page number per line, despite the .json name)
            with open("./TimeoutError.json", "a", encoding="utf-8") as f:
                f.write(str(page) + "\n")
            return None


def parse(text, page):
    content_list = []
    html = etree.HTML(text)
    # Titles and bodies are sibling divs under #WebMainBody, paired by position,
    # so both node lists must have the same length for the page to parse cleanly
    div_Listtitle = html.xpath("//div[@id='WebMainBody']/div[@class='Listtitle']")
    div_newcontent = html.xpath("//div[@id='WebMainBody']/div[@class='newcontent']")
    if div_Listtitle and div_newcontent and len(div_Listtitle) == len(div_newcontent):
        for index, item_Listtitle in enumerate(div_Listtitle):
            title = item_Listtitle.xpath("./a/text()")[0]
            content = div_newcontent[index].xpath("./text()")[0]
            content_list.append((title, content))
        print("Success:", page)
    else:
        # Log the page number of any page whose structure did not match
        with open("./error.json", "a", encoding="utf-8") as f:
            f.write(str(page) + "\n")
    return content_list


async def store(file, content_list):
    with open(file, "a", encoding="utf-8") as f:
        for content_item in content_list:
            f.write(str(content_item) + "\n")


async def init(url, page, session):
    text = await fetch(url, page, session)
    if text is None:  # the request timed out; nothing to parse or store
        return
    content_list = parse(text, page)
    await store("./data_test.txt", content_list)


async def main(end=1000, start=1, step=1):
    base_url = "http://wap.xaecong.com/zhuzuo.asp?"
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.ensure_future(init(base_url + "page=" + str(page), page, session))
                 for page in range(start, end, step)]
        await asyncio.wait(tasks)


if __name__ == '__main__':
    start_time = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print("Elapsed time: {:.2f}s".format(time.time() - start_time))

# Multithreaded crawler ----- 奕聪软件
import time
from lxml import etree
import requests
import threading


def fetch(session, url, page):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0",
    }

    response = session.get(url=url, headers=headers, timeout=10)
    return response.content.decode("utf-8")


def parse(text, page):
    content_list = []
    html = etree.HTML(text)
    # Titles and bodies are sibling divs under #WebMainBody, paired by position,
    # so both node lists must have the same length for the page to parse cleanly
    div_Listtitle = html.xpath("//div[@id='WebMainBody']/div[@class='Listtitle']")
    div_newcontent = html.xpath("//div[@id='WebMainBody']/div[@class='newcontent']")
    if div_Listtitle and div_newcontent and len(div_Listtitle) == len(div_newcontent):
        for index, item_Listtitle in enumerate(div_Listtitle):
            title = item_Listtitle.xpath("./a/text()")[0]
            content = div_newcontent[index].xpath("./text()")[0]
            content_list.append((title, content))
        print("Success:", page)
    else:
        # Log the page number of any page whose structure did not match
        with open("./error.json", "a", encoding="utf-8") as f:
            f.write(str(page) + "\n")
    return content_list


def store(file, content_list):
    with open(file, "a", encoding="utf-8") as f:
        for content_item in content_list:
            f.write(str(content_item) + "\n")


def main(page):
    base_url = "http://wap.xaecong.com/zhuzuo.asp?"
    with requests.Session() as session:
        url = base_url + "page=" + str(page)
        text = fetch(session, url, page)
        content_list = parse(text, page)
        store("./data_test1.txt", content_list)


if __name__ == '__main__':
    # One thread per page: simple, but this starts ~1000 OS threads at once
    threads = [threading.Thread(target=main, args=(page,)) for page in range(1, 1000)]
    start_time = time.time()
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print("Elapsed time: {:.2f}s".format(time.time() - start_time))
# Traditional (sequential) crawler ----- 奕聪软件
import time
from lxml import etree
import requests


def fetch(session, url, page):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0",
    }

    response = session.get(url=url, headers=headers, timeout=10)
    return response.content.decode("utf-8")


def parse(text, page):
    content_list = []
    html = etree.HTML(text)
    # Titles and bodies are sibling divs under #WebMainBody, paired by position,
    # so both node lists must have the same length for the page to parse cleanly
    div_Listtitle = html.xpath("//div[@id='WebMainBody']/div[@class='Listtitle']")
    div_newcontent = html.xpath("//div[@id='WebMainBody']/div[@class='newcontent']")
    if div_Listtitle and div_newcontent and len(div_Listtitle) == len(div_newcontent):
        for index, item_Listtitle in enumerate(div_Listtitle):
            title = item_Listtitle.xpath("./a/text()")[0]
            content = div_newcontent[index].xpath("./text()")[0]
            content_list.append((title, content))
        print("Success:", page)
    else:
        # Log the page number of any page whose structure did not match
        with open("./error.json", "a", encoding="utf-8") as f:
            f.write(str(page) + "\n")
    return content_list


def store(file, content_list):
    with open(file, "a", encoding="utf-8") as f:
        for content_item in content_list:
            f.write(str(content_item) + "\n")


def main():
    base_url = "http://wap.xaecong.com/zhuzuo.asp?"
    with requests.Session() as session:
        for page in range(1, 1000):
            url = base_url + "page=" + str(page)
            text = fetch(session, url, page)
            content_list = parse(text, page)
            store("./data_test1.txt", content_list)


if __name__ == '__main__':
    start_time = time.time()
    main()
    print("Elapsed time: {:.2f}s".format(time.time() - start_time))