爬取斗图网案例

本篇以爬取斗图网为案例,分别有单线程爬取斗图网(耗时142秒),多线程爬取斗图网(耗时8秒),以及异步爬取斗图网(不足3秒)。

单线程爬取斗图王

  • 单线程版本
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import requests
from lxml import etree
import os
import time


def get_img_urls():
headers = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Mobile Safari/537.36'
}

res = requests.get("https://www.pkdoutu.com/photo/list/", headers=headers, verify=False)

selector = etree.HTML(res.text)
img_urls = selector.xpath('//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a/img[@data-backup]/@data-backup')

return img_urls


# 根据img_urls下载
def download_img(url):
res = requests.get(url)
# img_name = url.split("/")[-1]
img_name = os.path.basename(url)
path = os.path.join("imgs", img_name)

if not os.path.exists("imgs"):
os.mkdir("imgs")

with open(path, "wb") as f:
for i in res.iter_content():
f.write(i)
print(f"{img_name}下载完成")

start = time.time()

img_urls = get_img_urls()

for img_url in img_urls:
download_img(img_url)

print(f"整体耗时{time.time() - start}秒")

会有一个警告,这个警告对于后面的代码逻辑没有影响

requests.get(“https://www.pkdoutu.com/photo/list/", headers=headers, verify=False)

这是因为verify关键字设置了False,参数这是因为我们没有配值数字证书,会提示通信不安全

  • 加入如下代码可以忽略警告
1
2
import urllib3
urllib3.disable_warnings()

多线程版本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import requests
from lxml import etree
import os
import time

import threading


def get_img_urls():
headers = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Mobile Safari/537.36'
}

res = requests.get("https://www.pkdoutu.com/photo/list/", headers=headers)

# print(res.text)

selector = etree.HTML(res.text)
img_urls = selector.xpath('//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a/img[@data-backup]/@data-backup')

return img_urls


# 根据img_urls下载
def download_img(url):
res = requests.get(url)
# img_name = url.split("/")[-1]
img_name = os.path.basename(url)
path = os.path.join("imgs", img_name)

if not os.path.exists("imgs"):
os.mkdir("imgs")

with open(path, "wb") as f:
for i in res.iter_content():
f.write(i)
print(f"{img_name}下载完成")


start = time.time()

img_urls = get_img_urls()

t_list = []

for img_url in img_urls:
t = threading.Thread(target=download_img, args=(img_url,))
t.start()
t_list.append(t)

for t in t_list:
t.join()

print(f"整体耗时{time.time() - start}秒")

异步版本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import requests
from lxml import etree
import os
import time
import asyncio
import aiohttp

headers = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Mobile Safari/537.36'
}


async def get_img_urls():

async with aiohttp.ClientSession() as session:
async with session.get("https://www.pkdoutu.com/photo/list/", headers=headers, ssl=False) as response:
selector = etree.HTML(await response.text())
img_urls = selector.xpath('//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a/img[@data-backup]/@data-backup')

return img_urls


# 根据img_urls下载
async def download_img(url):

async with aiohttp.ClientSession() as session:
async with session.get(url, ssl=False) as response:
img_name = os.path.basename(url)
path = os.path.join("imgs", img_name)

if not os.path.exists("imgs"):
os.mkdir("imgs")

with open(path, "wb") as f:
f.write(await response.content.read())

print(f"{img_name}下载完成")


async def main():
# 爬取所有img_url
img_urls = await get_img_urls()

# 根据url下载所有图片
tasks = [asyncio.create_task(download_img(url)) for url in img_urls]

await asyncio.wait(tasks)

# for img_url in img_urls:
# download_img(img_url)


# 主逻辑
start = time.time()

asyncio.run(main())


print(f"整体耗时{time.time() - start}秒")