There is a desktop wallpaper site that countless people have scraped, and one category board in particular is a perennial favorite: pretty-girl wallpaper collections of every type imaginable, a guaranteed haul for any collector.
Hence this article: an async crawler. A Python async crawler case study on the girl-wallpaper category, grabbing all of them!
Single-threaded, async, and multithreaded versions are all pasted below, ready for you. Reply with the keyword “小姐姐” in the public account backend to get the source code, copy-paste ready. No documentation or theory here; just Ctrl+C, Ctrl+V and go!
Target site: 彼岸桌面 (netbian.com)
URL: http://www.netbian.com/meinv/
Only the first three pages are crawled here; if you're interested, tweak it yourself and crawl more!
You can also swap in a different category with no trouble (a usage example follows the single-threaded source at the end).
Single-threaded run results (screenshot)
Single-threaded crawl output (screenshot)
Single-threaded crawl time (screenshot)
Async + multithreaded image download time (screenshot)
Async core source code:
# Excerpted async methods of the Net class; imports added for context
import asyncio
import aiohttp
from lxml import etree

async def get_content(self, url):
    async with aiohttp.ClientSession() as session:
        response = await session.get(url, headers=self.headers, timeout=5)
        content = await response.read()
        return content

async def get_parse_urls(self, url):
    content = await self.get_content(url)
    html = content.decode('gbk')
    req = etree.HTML(html)
    hrefs = req.xpath('//div[@class="list"]/ul/li/a/@href')
    print(len(hrefs))
    self.pare_urls.extend(hrefs)

def list_run(self):
    urls_tasks = []
    loop = asyncio.get_event_loop()
    for i in range(1, self.page_max + 1):
        print(f'>> Crawling page {i} list links..')
        if i == 1:
            url = f'{self.url}/{self.category}/'
        else:
            url = f'{self.url}/{self.category}/index_{i}.htm'
        c = self.get_parse_urls(url)
        # Wrap the returned coroutine object into a task object
        urls_task = asyncio.ensure_future(c)
        urls_tasks.append(urls_task)
    loop.run_until_complete(asyncio.wait(urls_tasks))
    print(len(self.pare_urls))

async def get_parse(self, url):
    content = await self.get_content(url)
    html = content.decode('gbk')
    req = etree.HTML(html)
    img_url = req.xpath('//div[@class="pic"]/p/a/img/@src')[0]
    img_name = req.xpath('//div[@class="pic"]/p/a/img/@alt')[0]
    print(img_url, img_name)
    data = img_url, img_name
    self.datas.append(data)
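The excerpt above covers only fetching and parsing; the download step and the event-loop driver are not shown (reply with the keyword for the full source). Below is a minimal, self-contained sketch of the same async pattern end to end; the names fetch, crawl_page, and main are illustrative and not from the original project:

# Minimal standalone sketch of the async fetch pattern
# (fetch/crawl_page/main are assumed names, not from the original project)
import asyncio
import aiohttp
from lxml import etree

BASE = 'http://www.netbian.com'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

async def fetch(session, url):
    # Reuse one ClientSession for all requests instead of one per call
    async with session.get(url, headers=HEADERS,
                           timeout=aiohttp.ClientTimeout(total=5)) as resp:
        return await resp.read()

async def crawl_page(session, page):
    url = f'{BASE}/meinv/' if page == 1 else f'{BASE}/meinv/index_{page}.htm'
    html = (await fetch(session, url)).decode('gbk')
    req = etree.HTML(html)
    return req.xpath('//div[@class="list"]/ul/li/a/@href')

async def main(page_max=3):
    async with aiohttp.ClientSession() as session:
        # Schedule all list pages concurrently and gather the results
        pages = await asyncio.gather(
            *(crawl_page(session, p) for p in range(1, page_max + 1)))
    hrefs = [h for page in pages for h in page]
    print(f'Collected {len(hrefs)} detail links')

if __name__ == '__main__':
    asyncio.run(main())

Reusing a single ClientSession for all requests, as the sketch does, lets aiohttp pool connections; the excerpt above opens a new session per request, which works but pays a connection setup each time.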
Multithreaded image collection time (screenshot)
Multithreaded core source code reference:
# Excerpted multithreaded methods of the Net class; imports added for context
import threading
import requests
from multiprocessing.dummy import Pool as ThreadPool  # thread-backed Pool (assumed import)
from lxml import etree

def down_run(self):
    print('>> Starting multithreaded image download..')
    try:
        # With no argument, the pool's worker count defaults to the number of CPU cores
        pool = ThreadPool()
        # Each element of self.datas is one (img_url, img_name) tuple
        results = pool.map(self.down, self.datas)
        pool.close()
        pool.join()
        print("All image downloads complete!")
    except:
        print("Error: unable to start thread")

def get_urllist(self, url):
    html = requests.get(url, headers=self.headers).content.decode('gbk')
    req = etree.HTML(html)
    hrefs = req.xpath('//div[@class="list"]/ul/li/a/@href')
    print(len(hrefs))
    threadings = []
    for href in hrefs:
        href = f'{self.url}{href}'
        print(f'>> Crawling {href}..')
        t = threading.Thread(target=self.parse, args=(href,))
        threadings.append(t)
        t.start()
    for x in threadings:
        x.join()
    print("Multithreaded collection of this page's images complete!")
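One thing to watch: pool.map hands self.down a single element of self.datas per call, and each element is an (img_url, img_name) tuple, so the down used with the thread pool must unpack a tuple, unlike the two-argument down in the single-threaded source below. A minimal sketch of such a variant, with a body modeled on the single-threaded down (the original excerpt does not show it, so this is my assumption):

import os
import requests

def down(self, data):
    # pool.map delivers one (img_url, img_name) tuple per call; unpack it here
    img_url, img_name = data
    r = requests.get(img_url, headers=self.headers, timeout=5)
    suffix = os.path.splitext(img_url)[-1]
    with open(f'{self.category}/{img_name}{suffix}', 'wb') as f:
        f.write(r.content)
    print(f'Downloaded {img_name}{suffix} successfully!')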
Single-threaded crawl source code reference:
# -*- coding=utf-8 -*-
# 彼岸桌面 (Netbian) wallpaper image collector
# @author WeChat: huguo00289
# @WeChat public account: 二爷记
import requests, os, time
from lxml import etree
from fake_useragent import UserAgent


class Net(object):
    def __init__(self, category="meinv", page_max=3):
        self.ua = UserAgent()
        self.headers = {'User-Agent': self.ua.random}
        self.url = "http://www.netbian.com"
        self.category = category
        os.makedirs(f'{self.category}/', exist_ok=True)
        self.page_max = page_max

    def get_urllist(self, url):
        # Fetch a list page and walk every detail-page link on it
        html = requests.get(url, headers=self.headers).content.decode('gbk')
        req = etree.HTML(html)
        hrefs = req.xpath('//div[@class="list"]/ul/li/a/@href')
        print(len(hrefs))
        for href in hrefs:
            href = f'{self.url}{href}'
            print(f'>> Crawling {href}..')
            self.parse(href)

    def parse(self, url):
        # Extract the image URL and name from a detail page
        html = requests.get(url, headers=self.headers).content.decode('gbk')
        req = etree.HTML(html)
        img_url = req.xpath('//div[@class="pic"]/p/a/img/@src')[0]
        img_name = req.xpath('//div[@class="pic"]/p/a/img/@alt')[0]
        print(img_url, img_name)
        self.down(img_url, img_name)

    def down(self, img_url, img_name):
        # Download one image and save it under the category folder
        r = requests.get(img_url, headers=self.headers, timeout=5)
        suffix = os.path.splitext(img_url)[-1]
        with open(f'{self.category}/{img_name}{suffix}', 'wb') as f:
            f.write(r.content)
        print(f'Downloaded image {img_name}{suffix} successfully!')

    def main(self):
        for page in range(1, int(self.page_max) + 1):
            if page == 1:
                url = f'{self.url}/{self.category}/'
            else:
                url = f'{self.url}/{self.category}/index_{page}.htm'
            print(f'>> Crawling page {page} data..')
            self.get_urllist(url)


if __name__ == '__main__':
    start = time.time()  # record the start timestamp
    spider = Net()
    spider.main()
    end = time.time()  # record the end timestamp
    print('Total run time: {} seconds'.format(end - start))  # program elapsed time
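To try a different category or page count, as mentioned earlier, just pass them to the constructor. A quick usage sketch; "fengjing" is an assumed category slug here, so check the actual path segment in the site's URL:

if __name__ == '__main__':
    import time
    start = time.time()
    # "fengjing" (scenery) is an assumed slug; replace with a real category path
    spider = Net(category="fengjing", page_max=5)
    spider.main()
    print('Total run time: {:.2f} seconds'.format(time.time() - start))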
Having compared the run times above, you should now have a much clearer picture of how multithreaded and async crawling stack up!
For the async and multithreaded crawlers, I've already put the demos together; they should be useful if you're just starting out. Old hands can skip them. The code is rough, so bear with me!
Get the complete project
Follow the WeChat public account: 二爷记
Reply with the keyword “小姐姐” in the backend