Baidu Image Downloader

Sharing a program I wrote: a Baidu image downloader.

Introduction:

A Baidu Images download assistant that can crawl images from Baidu Images with customizable search filters.

Development environment:

PyCharm (Python 3.8). Third-party libraries: requests and selenium (multiprocessing and json are part of the standard library).

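To install the two third-party libraries, the same mirror trick used in the script headers below works:

pip --default-timeout=100 install -i https://pypi.tuna.tsinghua.edu.cn/simple requests selenium
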
Usage:

Two approaches were implemented:

(1) Analyze and decode the AJAX requests behind Baidu Images and scrape the images directly. This approach is simple, but it is easily detected by Baidu's anti-crawling measures, so it is not very stable (a trimmed-down sketch follows this list).

(2) Use Selenium to simulate browser actions, which sidesteps Baidu's anti-crawling checks and is more stable. Multiprocessing is used on top to improve throughput.
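
A trimmed-down sketch of approach (1), using the same endpoint and parameters as the full Version 1 code below:

import urllib.parse

import requests

word = urllib.parse.quote('cat')  # keyword, URL-encoded
url = ('https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj'
       '&queryWord={0}&word={0}&ie=utf-8&oe=utf-8&pn=0&rn=30').format(word)
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
print(resp.status_code)  # the JSON body carries obfuscated objURL fields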

Show Me The Code:

Version 1:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Payiz
# @project: Baidu Image Downloader
# @File : 下载器1.0.py
# @Time : 2020/12/5 05:35:22
# Need the third-party libraries? Try the following (Tsinghua mirror):
# pip --default-timeout=100 install -i https://pypi.tuna.tsinghua.edu.cn/simple name
# -----------------------------------

import json
import os
import re
import time
import urllib.parse

import requests

# number of results returned per JSON request
page_num = 30
photo_dir = './下载结果/'  # root download directory


def getDetailImage(word, width, height, save_dir):
    num = 0
    url = 'https://image.baidu.com/search/acjson?'
    string_parameters = ('tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result'
                         '&queryWord={0}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1'
                         '&z=&ic=&hd=&latest=&copyright=&word={0}&s=&se=&tab='
                         '&width={2}&height={3}&face=&istype=&qc=&nc=1&fr=&expermode='
                         '&nojc=&pn={1}&rn=' + str(page_num) + '&gsm=3c&1627926209=')
    url = url + string_parameters
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/avif,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'image.baidu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/92.0.4515.107 Safari/537.36',
    }
    # fetch three pages of results
    while num < 3:
        page_url = url.format(urllib.parse.quote(word), num * page_num, width, height)
        print(page_url)

        session = requests.session()
        session.headers = headers
        response = session.get(page_url, allow_redirects=True)

        # escape stray backslashes so the response parses as valid JSON
        regex = re.compile(r'\\(?![/u"])')
        try:
            json_data = json.loads(regex.sub(r"\\\\", response.text))

            for item in json_data['data']:
                obj_url = item['objURL']
                img_type = item['type']
                pic_url = baidtu_uncomplie(obj_url)
                print(pic_url)
                html = requests.get(pic_url, timeout=5)

                # use a timestamp as a unique file name
                file_name = str(time.time()).replace('.', '1') + '.' + img_type
                with open(save_dir + '/' + file_name, 'wb') as f:
                    f.write(html.content)
        except Exception as e:
            print('Error:', e)

        num = num + 1
        time.sleep(10)


# decode Baidu's obfuscated objURL (a simple substitution cipher)
def baidtu_uncomplie(url):
    res = ''
    c = ['_z2C$q', '_z&e3B', 'AzdH3F']
    d = {'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f', '2': 'g', 'i': 'h',
         't': 'i', '3': 'j', 'h': 'k', 's': 'l', '4': 'm', 'g': 'n', '5': 'o', 'r': 'p',
         'q': 'q', '6': 'r', 'f': 's', 'p': 't', '7': 'u', 'e': 'v', 'o': 'w', '8': '1',
         'd': '2', 'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7', 'b': '8', 'l': '9',
         'a': '0', '_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'}
    if url is None or 'http' in url:
        return url
    j = url
    for m in c:
        j = j.replace(m, d[m])
    for char in j:
        if re.match(r'^[a-w\d]+$', char):
            char = d[char]
        res = res + char
    return res


if __name__ == "__main__":
    # get user input
    words = input("Enter the keyword(s) to search for; separate multiple keywords with spaces:\n").split(' ')
    print(words)
    for word in words:
        word_dir = os.path.join(photo_dir, word)
        if not os.path.exists(word_dir):
            os.makedirs(word_dir)
        widths = ['1920', ]
        heights = ['1080', ]
        for width, height in zip(widths, heights):
            word_dir2 = word_dir + '/' + (width + 'x' + height)
            if not os.path.exists(word_dir2):
                os.makedirs(word_dir2)
            getDetailImage(word, width, height, word_dir2)
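
A quick sanity check of the objURL decoder: the input below was built by hand from the substitution table above (it is not a real Baidu response), and it decodes as expected:

>>> baidtu_uncomplie('ippr_z2C$qAzdH3FAzdH3Fw_z&e3Bv54')
'http://a.com'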

Version 2:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Payiz
# @project: Baidu Image Downloader
# @File : 下载器2.0.py
# @Time : 2020/12/5 05:03:33
# Need the third-party libraries? Try the following (Tsinghua mirror):
# pip --default-timeout=100 install -i https://pypi.tuna.tsinghua.edu.cn/simple name
# -----------------------------------


import multiprocessing
import os
import random
import re
import time

import requests
import requests.adapters
from selenium import webdriver  # drives the browser
from selenium.webdriver.chrome.options import Options  # browser settings
from selenium.webdriver.common.action_chains import ActionChains  # mouse hover
from selenium.webdriver.common.by import By  # locators: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.support import expected_conditions as ec  # conditions used with WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait  # waits for page elements to load

# read the User-Agent pool, one UA string per line
all_user_agents = []
with open("./请求头.txt", 'r', encoding='utf-8') as f:
    for i in f:
        all_user_agents.append(i.replace("\n", '').replace('\r', ''))


# collect image titles and URLs for one keyword
def get_baidu_pic(keyword, select_size=1, pages=1):
    #########################################################################
    # Interface:
    # keyword      keyword to search for
    # select_size  size filter: 1 all, 2 extra large, 3 large, 4 medium, 5 small
    # pages        number of result pages to crawl; a page holds roughly 20-50 images
    #########################################################################

    # target site
    url = 'https://image.baidu.com/'

    # 0. create the browser object and set its options
    chrome_options = Options()
    # window options
    chrome_options.add_argument('--disable-gpu')  # works around a known headless-Chrome bug
    # chrome_options.add_argument("--start-maximized")    # maximize the window
    # chrome_options.add_argument('window-size=100x100')  # custom window size
    chrome_options.add_argument('--headless')  # run in the background
    # page-loading options
    chrome_options.add_argument('--no-sandbox')  # disable the Chrome sandbox
    chrome_options.add_argument("--disable-infobars")  # suppress infobars
    chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars on some special pages
    chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip loading images, for speed
    # an alternative way to skip images:
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_options.add_experimental_option("prefs", prefs)
    # set a Chinese locale
    chrome_options.add_argument('lang=zh_CN.UTF-8')

    # attach a random User-Agent (a proxy could be added the same way,
    # e.g. chrome_options.add_argument('--proxy-server=' + ip))
    chrome_options.add_argument('user-agent={}'.format(random.choice(all_user_agents)))

    # create the browser object
    try:
        browser = webdriver.Chrome(options=chrome_options)
    except Exception as message:
        print('Failed to create the browser object:', message)
        return []

    result = []  # collected results
    try:
        # 1. open the page
        browser.get(url)
        time.sleep(0.2)
        # 2. type the keyword into the search box
        wait = WebDriverWait(browser, 10, 0.1)
        wait.until(ec.presence_of_all_elements_located((By.ID, "kw")))
        inputs = browser.find_element(By.ID, "kw")
        inputs.send_keys(keyword)

        # 3. submit the query
        # inputs.send_keys(Keys.ENTER)  # pressing Enter redirects to another site here, so click instead
        browser.find_element(By.CLASS_NAME, "s_newBtn").click()

        # 4. load the result pages and collect the target images
        try:
            # size filter: wait for the filter bar, then hover over it
            wait = WebDriverWait(browser, 10, 0.1)
            wait.until(ec.presence_of_all_elements_located((By.CLASS_NAME, "sizeFilter")))
            element = browser.find_element(By.CLASS_NAME, 'sizeFilter')
            actions = ActionChains(browser)
            actions.move_to_element(element).perform()
            # pick a size and click it
            time.sleep(0.3)
            choose_size = browser.find_element(
                By.XPATH,
                '/html/body/div[1]/div[4]/div[2]/div/div[2]/div/div[2]/ul/li[{}]'.format(str(select_size)))
            choose_size.click()
            # walk through the result pages
            for one_page in range(1, pages + 1):
                wait = WebDriverWait(browser, 10, 0.1)
                wait.until(ec.presence_of_all_elements_located((By.ID, "imgid")))
                # wait until all images on this page are present
                page_item = '//*[@id="imgid"]/div[{}]/ul'.format(str(one_page))
                wait = WebDriverWait(browser, 10, 0.1)
                wait.until(ec.presence_of_all_elements_located((By.XPATH, page_item)))
                # grab every image item on this page
                imgitems = browser.find_element(By.XPATH, page_item).find_elements(By.CLASS_NAME, "imgitem")
                # print("page {} holds {} images".format(one_page, len(imgitems)))
                for i in imgitems:
                    title = i.get_attribute('data-title').replace('<strong>', '').replace('</strong>', '')
                    pic_url = i.get_attribute('data-objurl')
                    result.append({"title": title, "pic_url": pic_url})
                # scroll to the bottom so the next page gets loaded
                browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')

        except Exception as message:
            print('Locator error:', message)

    except Exception as e:
        print('Failed to open:', keyword)
        print('Failed to open page {}, error:'.format(url), e)
    finally:
        # close the browser object
        browser.quit()
    return result


# error callback for the process pool
def error_handler(e):
    print('A worker process raised an error!')
    print(dir(e), "\n")
    print("-->{}<--".format(e.__cause__))


# download a single image
def down_load_one_pic(file_path, name, url):
    def validateTitle(title):
        rstr = r"[\/\\\:\*\?\"\<\>\|]"  # characters illegal in file names: / \ : * ? " < > |
        return re.sub(rstr, "_", title)  # replace them with underscores

    try:
        # allow up to 5 retries
        requests.adapters.DEFAULT_RETRIES = 5
        ses = requests.Session()
        # do not keep connections alive
        ses.keep_alive = False
        head = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,'
                      '*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'user-agent': random.choice(all_user_agents)
        }

        res = ses.get(url, headers=head, timeout=10)
        if res.status_code == 200:
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            # truncate overly long titles and strip illegal characters
            ss = name if len(name) < 30 else name[0:30]
            ss = validateTitle(ss)
            path = file_path + '/' + ss + '.png'
            with open(path, 'wb') as f:
                f.write(res.content)
            print("Image", ss, "downloaded")
        else:
            print('\n!!! Got status code {} for URL {}'.format(str(res.status_code), url))
        # close the response to release memory
        res.close()
        del res
    except Exception as e:
        print("\n!!! Request failed, error: {}, URL: {}".format(e, url))
        time.sleep(5)


if __name__ == '__main__':
    start = time.time()
    # get user input
    keyword = input("Enter the keyword(s) to search for; separate multiple keywords with spaces:\n").split(' ')
    print(keyword)
    size_input = input("Choose a size filter: 1 all, 2 extra large, 3 large, 4 medium, 5 small (default 1):\n")
    count_input = input("How many images per keyword? (default 20):\n")
    # file_path = input("Enter the download directory:\n")
    file_path = "./下载结果/"
    # sanitize the input
    if keyword == ['']:
        keyword = ["404"]  # ha ha ha
    size = int(size_input) if size_input.isdigit() else 1
    if size not in [1, 2, 3, 4, 5]:
        size = 1
    count = int(count_input) if count_input.isdigit() else 20
    if not 0 < count < 1000:
        count = 20
    pages = count // 30 + 1
    # collect the image URLs
    print("Collecting image URLs, please wait 💖💖💖")
    all_keywords = []
    for kw in keyword:
        print("Collecting images for keyword:", kw, "...")
        get_one = {}
        get_one['key_word'] = kw
        get_one['file_path'] = file_path + kw
        get_one['all_pic'] = get_baidu_pic(kw, size, pages)[0:count]
        all_keywords.append(get_one)

    print("Starting the downloads! 😘😘😘")
    # test: single-process download
    # for i in all_keywords:
    #     print("downloading:", i['key_word'])
    #     for j in i['all_pic']:
    #         down_load_one_pic(i['file_path'], j['title'], j['pic_url'])

    # multiprocess download: flatten (directory, title, url) triples for the pool
    params = []
    for one in all_keywords:
        for one_pic in one['all_pic']:
            params.append((one['file_path'], one_pic['title'], one_pic['pic_url']))

    pool = multiprocessing.Pool(processes=32)
    pool.starmap_async(func=down_load_one_pic, iterable=params, error_callback=error_handler)
    pool.close()
    pool.join()

    print('\n------------------------------------------------')
    end = time.time()
    print('Done. Total time:', end - start, 'seconds')
    print('\nSee you next time! ❤')
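
Note that Version 2 reads its User-Agent pool from 请求头.txt in the working directory, one UA string per line. A minimal example file (the exact UA strings are only illustrative; any valid desktop UAs work):

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36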

Demo:

FAQ

Version 2 is recommended. Make sure the Selenium library and chromedriver are installed; if you are not sure how, see here.

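If you want to verify the setup first, here is a minimal smoke test (it assumes chromedriver is on your PATH and matches your installed Chrome):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')
driver = webdriver.Chrome(options=opts)  # raises if chromedriver is missing or mismatched
driver.get('https://image.baidu.com/')
print(driver.title)  # prints the page title when everything works
driver.quit()
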
Project source code

This project is open source: project page. Feel free to visit my GitHub for more interesting projects!
