# User-Agent header value sent with every request. Reproduced exactly as the
# original code sent it (including the unbalanced parenthesis).
_USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.5; windows NT"

# Site scraped by default.
_SEPUTU_URL = "http://seputu.com"

# Seconds to wait for the server before giving up; without a timeout
# ``requests.get`` can block forever on a stalled connection.
_REQUEST_TIMEOUT = 10


def _fetch_text(url):
    """Download *url* and return its body decoded with the detected encoding.

    The response encoding is overridden with ``apparent_encoding`` (the
    encoding guessed from the body) before the text is decoded.

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the server does not answer within ``_REQUEST_TIMEOUT`` seconds.
    """
    response = requests.get(
        url,
        headers={"User-Agent": _USER_AGENT},
        timeout=_REQUEST_TIMEOUT,
    )
    response.encoding = response.apparent_encoding
    return response.text


def parser_seputu():
    """Collect link targets and titles from seputu.com using BeautifulSoup.

    Every ``<a>`` element found inside an element with class ``box`` adds one
    entry to each list, so the two lists stay index-aligned.

    Returns:
        dict: ``{"href": [...], "title": [...]}``. An entry is ``None`` when
        the anchor lacks the corresponding attribute.
    """
    soup = BeautifulSoup(_fetch_text(_SEPUTU_URL), "html.parser")
    content = {"href": [], "title": []}
    for box in soup.find_all(class_="box"):
        for anchor in box.find_all("a"):
            content["href"].append(anchor.get("href"))
            content["title"].append(anchor.get("title"))
    return content


def parse_path(url=None):
    """Print link targets and titles found on *url* using lxml XPath queries.

    For each ``<div class="box">`` in the page, all ``href`` values of the
    anchors under ``./ul/li/a`` are printed first, followed by all of their
    ``title`` values.

    Args:
        url: Page to fetch. Previously this argument was ignored and
            seputu.com was always requested; it is now honoured. When it is
            omitted or falsy the function falls back to seputu.com so
            existing calls keep working.
    """
    html = etree.HTML(_fetch_text(url or _SEPUTU_URL))
    for div_box in html.xpath("//div[@class='box']"):
        for href in div_box.xpath("./ul/li/a/@href"):
            print(href)
        for title in div_box.xpath("./ul/li/a/@title"):
            print(title)
利用 XPath 和 BeautifulSoup 爬取简单网站
最新推荐文章于 2022-07-31 07:00:47 发布