|
注册登录后全站资源免费查看下载
您需要 登录 才可以下载或查看,没有账号?立即注册
×
本帖最后由 开车司机 于 2022-8-17 11:34 编辑
======================================两种方式==原作者@zrq648022547 ==指定图集首页地址,爬取单图册图片 == ====================================
代码一:爬取指定美女图册
缺点:
1、虽然写入了多线程爬取,但是测试貌似还是比较慢;
2、爬取的图片名称不能按照序号命名
3、未处理反爬机制
- # 多进程异步并发
- import random
- import requests
- from bs4 import BeautifulSoup
- import os
- import time
- import threading
- from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
- import concurrent.futures
-
# Pool of browser User-Agent strings; one entry is picked at random when the
# request headers below are built.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]

# Headers sent with every request in this script. The User-Agent is chosen
# once, when this statement runs, and is then reused for the whole run.
# NOTE(review): the referer host differs from the site root this script
# crawls (start_url) -- confirm which host the image server expects.
headers = {
    'User-Agent': random.choice(USER_AGENTS),
    "referer": "https://www.xiurenji.vip/"
}
-
-
def get_item(item, main_title):
    """Download the gallery that one index-page anchor points to.

    Args:
        item: Anchor element from the index page; its ``title`` attribute
            names the gallery and its ``href`` attribute is appended to the
            module-level ``start_url`` to form the gallery address.
        main_title: Name of the parent folder, passed through to
            ``get_images``.

    The download runs in a worker thread that is joined immediately, so this
    call returns only after the gallery has been processed. An exception
    raised inside the worker does not propagate to the caller.
    """
    title = item['title']
    item_link = start_url + item['href']
    # print(f'{title}>>>{item_link}')
    worker = threading.Thread(
        target=get_images,
        args=(title, item_link, main_title),
    )
    worker.start()
    worker.join()
-
-
def get_images(title, item_link, main_title):
    """Save every image of a gallery, following its pagination links.

    Args:
        title: Gallery title; used as the name of the innermost folder.
        item_link: Absolute URL of the gallery page to start from.
        main_title: Name of the folder that groups the galleries.

    Reads the module-level ``start_url`` (prefix for relative links),
    ``main_folder`` (output root) and ``headers`` (request headers).
    Images are written to ``main_folder/main_title/title/`` under the last
    path segment of their URL. A failed page request ends the gallery; a
    failed image request skips only that image.
    """
    folder = os.path.join(main_folder, main_title, title)
    visited = set()  # Guards against pagination that links back to itself.

    while item_link and item_link not in visited:
        visited.add(item_link)
        try:
            item_res = requests.get(url=item_link, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print(f'页面请求失败>>>{item_link}>>>{exc}')
            return
        # 'gzip' is not a text codec, so requests silently fell back to
        # UTF-8 decoding; state the effective encoding explicitly.
        item_res.encoding = 'utf-8'
        item_soup = BeautifulSoup(item_res.text, 'lxml')
        os.makedirs(folder, exist_ok=True)

        for img in item_soup.select('.content_left p img'):
            src = img.get('src')
            if not src:
                continue
            img_link = start_url + src
            file_name = img_link.split('/')[-1]
            starttime = time.time()
            # Download before opening the file so that a failed request
            # does not leave an empty file behind.
            try:
                image = requests.get(url=img_link, headers=headers, timeout=30).content
            except requests.RequestException as exc:
                print(f'图片下载失败>>>{img_link}>>>{exc}')
                continue
            with open(os.path.join(folder, file_name), 'wb') as f:
                f.write(image)
            endtime = time.time()
            print(f'正在保存>>>{title}' + '>>>' + file_name + '>>>用时%.3f' % (endtime - starttime), 'seconds')
            time.sleep(1)  # Pause between images to throttle requests.

        # The last anchor in the pager is followed as the next page unless
        # it is marked as the current page, which ends the gallery.
        next_link = item_soup.select_one('.page a:last-of-type')
        if next_link is None or not next_link.get('href'):
            return
        if 'current' in (next_link.get('class') or []):
            return
        item_link = start_url + next_link['href']
-
-
if __name__ == '__main__':
    # Script entry point: fetch the index page, then download each gallery
    # it links to, one after another, and report the total elapsed time.
    start = time.time()
    start_url = 'https://www.xrmn5.cc'  # Site root; relative links are appended to it.
    main_title = '秀人网'
    # Root output folder (replace with your own path). The original literal
    # ended in a lone backslash, which escaped the closing quote and left the
    # string unterminated.
    main_folder = '单项目2'
    main_url = 'https://www.xrmn5.cc/younisi.html'
    itemlist_res = requests.get(url=main_url, headers=headers, timeout=30)
    # 'gzip' is not a text codec; requests fell back to UTF-8, so say so.
    itemlist_res.encoding = 'utf-8'
    itemlist_soup = BeautifulSoup(itemlist_res.text, 'lxml')
    itemlist = itemlist_soup.select('.list_n2 a')
    # print(itemlist)
    for item in itemlist:
        # get_item needs both attributes; skip anchors that lack either.
        if not item.get('href') or not item.get('title'):
            continue
        get_item(item, main_title)
    end = time.time()
    print('总共用时:', end - start, 'seconds', end='')
代码二:爬取美女单图集
缺点:
1、文件夹名称需要手动指定
2、爬取的图册首页地址需要手动指定
3、爬取到的图片数量与页面实际显示的数量不一致,部分页面的图片会被漏掉,原因尚不清楚
- import requests
- import parsel
- import random
- import os
- import datetime
- import time
-
-
# Wall-clock start of the run; used for the elapsed-time report at the end.
starttime = datetime.datetime.now()

# Pool of browser User-Agent strings; one entry is picked at random when the
# request headers below are built.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]

# Headers sent with every request in this script. The User-Agent is chosen
# once, when this statement runs, and is then reused for the whole run.
headers = {
    'User-Agent': random.choice(USER_AGENTS),
    # 'Connection': 'close'
    "referer": "https://www.xrmn5.cc"
}
-
# 0. Create the output folder. The name carries no trailing path separator;
#    file paths are built with os.path.join so they work on any platform.
directory = '[YouMi尤蜜荟]Vol.809_女神尤妮丝Egg红色轻透上衣配红短裙半脱露红色内衣诱惑写真60P'
os.makedirs(directory, exist_ok=True)

# 1. Walk the gallery pages. The first page has no numeric suffix; later
#    pages are addressed as "<id>_<page>.html".
for page in range(0, 30):
    if page >= 1:
        base_url = 'https://www.xrmn5.com/YouMi/2022/202211014_{}.html'.format(page)
    else:
        base_url = 'https://www.xrmn5.com/YouMi/2022/202211014.html'
    # print('==========正在爬取第{}页数据============='.format(page))

    # 2. Request the page once (the original fetched every page twice).
    try:
        response = requests.get(url=base_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print('页面请求失败:', base_url, exc)
        continue
    if response.status_code == 404:
        # Treat a missing page as the end of the gallery.
        break
    response.encoding = 'UTF-8'  # Decode the response body as UTF-8.
    html_1 = parsel.Selector(response.text)

    # 3. Collect every image address on the page. Probing img[1]..img[3]
    #    one at a time missed images and failed when a position was absent.
    img_srcs = html_1.xpath('//*[@class="content_left"]/p/img/@src').getall()
    for i, img_src in enumerate(img_srcs, start=1):
        img_list = img_src.replace('uploadfile', 'Uploadfile')
        img_url = 'https://p.xrmn5.com/' + str(img_list)
        try:
            img_data = requests.get(img_url, headers=headers, timeout=30).content
        except requests.RequestException as exc:
            print('图片下载失败:', img_url, exc)
            continue
        img_name_1 = str(int(page) + 1)  # 1-based page number used in the file name.

        # 4. Save the image as "<page>-<index>.jpg".
        with open(os.path.join(directory, img_name_1 + '-' + str(i) + '.jpg'), 'wb') as f:
            f.write(img_data)
        print('#####################正在爬取第', page + 1, '页,第', int(i), '张图片#####################')
        time.sleep(10)  # Pause between images to throttle requests.

endtime = datetime.datetime.now()
# total_seconds() also counts whole days, which timedelta.seconds drops.
print('####################下载完成,共用时', int((endtime - starttime).total_seconds()), '秒###################')
复制代码
|
|