import aiohttp
import asyncio
import async_timeout
import time
from lxml import etree
sem = asyncio.Semaphore(10)  # cap the number of requests in flight at 10
async def fetch(url, page, session):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.99 Safari/537.36",
    }
    global sem
    async with sem:
        try:
            # async_timeout.timeout() is an async context manager; the except clause has to
            # enclose it, otherwise the asyncio.TimeoutError escapes before it can be logged
            async with async_timeout.timeout(10):
                async with session.get(url=url, headers=headers) as response:
                    return await response.text()
        except asyncio.TimeoutError:
            with open("./TimeoutError.json", "a", encoding="utf-8") as f:
                f.write(str(page) + "\n")
def parse(text, page):
    content_list = []
    html = etree.HTML(text)
    div_Listtitle = html.xpath("//div[@id='WebMainBody']/div[@class='Listtitle']")
    div_newcontent = html.xpath("//div[@id='WebMainBody']/div[@class='newcontent']")
    if div_Listtitle and div_newcontent and len(div_Listtitle) == len(div_newcontent):
        for index, item_Listtitle in enumerate(div_Listtitle):
            item_Listtitle = item_Listtitle.xpath("./a/text()")[0]
            item_newcontent = div_newcontent[index].xpath("./text()")[0]
            content_list.append((item_Listtitle, item_newcontent))
    else:
        with open("./error.json", "a", encoding="utf-8") as f:
            f.write(str(page) + "\n")
    print("Success:", page)
    return content_list
async def store(file, content_list):
    with open(file, "a", encoding="utf-8") as f:
        for content_item in content_list:
            f.write(str(content_item) + "\n")
async def init(url, page, session):
    text = await fetch(url, page, session)
    if text is None:  # the request timed out; the page number is already logged in fetch
        return
    content_list = parse(text, page)
    await store("./data_test.txt", content_list)
async def main(end=1000, start=1, step=1):
    base_url = "http://wap.xaecong.com/zhuzuo.asp?"
    session = aiohttp.ClientSession()
    tasks = [asyncio.ensure_future(init(base_url + "page=" + str(page), page, session))
             for page in range(start, end, step)]
    await asyncio.wait(tasks)
    await session.close()
if __name__ == '__main__':
    start_time = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print("elapsed time: {:.2f}s".format(time.time() - start_time))
import time
from lxml import etree
import requests
import threading
def fetch(session, url, page):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0",
    }
    response = session.get(url=url, headers=headers, timeout=10)
    return response.content.decode("utf-8")
def parse(text, page):
    content_list = []
    html = etree.HTML(text)
    div_Listtitle = html.xpath("//div[@id='WebMainBody']/div[@class='Listtitle']")
    div_newcontent = html.xpath("//div[@id='WebMainBody']/div[@class='newcontent']")
    if div_Listtitle and div_newcontent and len(div_Listtitle) == len(div_newcontent):
        for index, item_Listtitle in enumerate(div_Listtitle):
            item_Listtitle = item_Listtitle.xpath("./a/text()")[0]
            item_newcontent = div_newcontent[index].xpath("./text()")[0]
            content_list.append((item_Listtitle, item_newcontent))
    else:
        # log failed page numbers one per line, as in the asyncio version
        with open("./error.json", "a", encoding="utf-8") as f:
            f.write(str(page) + "\n")
    print("Success:", page)
    return content_list
def store(file, content_list):
    with open(file, "a", encoding="utf-8") as f:
        for content_item in content_list:
            f.write(str(content_item) + "\n")
def main(page):
    base_url = "http://wap.xaecong.com/zhuzuo.asp?"
    with requests.Session() as session:
        url = base_url + "page=" + str(page)
        text = fetch(session, url, page)
        content_list = parse(text, page)
        store("./data_test1.txt", content_list)
if __name__ == '__main__':
    # one thread per page; a pooled variant with bounded concurrency is sketched below
    threads = [threading.Thread(target=main, args=(page,)) for page in range(1, 1000)]
    start_time = time.time()
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print("elapsed time: {:.2f}s".format(time.time() - start_time))
import time
from lxml import etree
import requests
def fetch(session, url, page):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0",
    }
    response = session.get(url=url, headers=headers, timeout=10)
    return response.content.decode("utf-8")
def parse(text, page):
    content_list = []
    html = etree.HTML(text)
    div_Listtitle = html.xpath("//div[@id='WebMainBody']/div[@class='Listtitle']")
    div_newcontent = html.xpath("//div[@id='WebMainBody']/div[@class='newcontent']")
    if div_Listtitle and div_newcontent and len(div_Listtitle) == len(div_newcontent):
        for index, item_Listtitle in enumerate(div_Listtitle):
            item_Listtitle = item_Listtitle.xpath("./a/text()")[0]
            item_newcontent = div_newcontent[index].xpath("./text()")[0]
            content_list.append((item_Listtitle, item_newcontent))
    else:
        # log failed page numbers one per line, as in the asyncio version
        with open("./error.json", "a", encoding="utf-8") as f:
            f.write(str(page) + "\n")
    print("Success:", page)
    return content_list
def store(file, content_list):
    with open(file, "a", encoding="utf-8") as f:
        for content_item in content_list:
            f.write(str(content_item) + "\n")
def main():
    base_url = "http://wap.xaecong.com/zhuzuo.asp?"
    with requests.Session() as session:
        for page in range(1, 1000):
            url = base_url + "page=" + str(page)
            text = fetch(session, url, page)
            content_list = parse(text, page)
            store("./data_test1.txt", content_list)
if __name__ == '__main__':
    start_time = time.time()
    main()
    print("elapsed time: {:.2f}s".format(time.time() - start_time))