1.导入依赖包
import re
import os
import time
import random
import requests
from lxml import etree
2.获取url
def get_url(url: str) -> None:
    """Crawl one category index and pass every listing page to ``parse_url``.

    Downloads ``url``, reads the pagination links found under
    ``<dd class="page">`` to derive the last page number and the link prefix
    shared by all pages, builds the URL of each listing page and calls
    ``parse_url`` on them in order. If the page has no pagination links, only
    ``url`` itself is parsed.

    Args:
        url: Category index URL. Page file names are appended to it directly,
            so it is expected to end with ``/``.

    Raises:
        ValueError: If the last pagination link does not end in an integer
            page number.
    """
    # Spoof a desktop browser user agent.
    headers = {
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb'
                       'Kit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 '
                       'Safari/537.36')
    }
    # Without a timeout a stalled connection would hang the crawl forever.
    res = requests.get(url, headers=headers, timeout=30)
    # Pause 1-3 seconds between requests.
    time.sleep(random.randint(1, 3))
    res.encoding = 'gbk'
    html = etree.HTML(res.text)

    link_list = [url]
    # Query the pagination links once and reuse the result.
    hrefs = html.xpath('//dd[@class="page"]/a/@href')
    if hrefs:
        # The last link's stem is expected to end with "_<last page number>".
        last_page = hrefs[-1].split('.')[0].split('_')[-1]
        # Drop the extension and the trailing page digit of the first link to
        # obtain the prefix shared by every page link.
        prefix = hrefs[0].split('.')[0][:-1]
        # Page 1 is the index itself, so numbered pages start at 2.
        for i in range(2, int(last_page) + 1):
            link_list.append(url + prefix + str(i) + '.html')

    for u in link_list:
        parse_url(u)
3.解析url
def parse_url(u):
    """Scan one listing page and fetch each gallery that is not on disk yet.

    Downloads ``u``, collects the gallery links and the ``alt`` text of their
    thumbnails from the ``list-left public-box`` list, and for every pair
    checks whether ``mnfl/<alt>1.jpg`` already exists. Galleries whose first
    image is missing are handed to ``get_pic``.

    Args:
        u: URL of a listing page.
    """
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/76.0.3809.132 Safari/537.36')
    response = requests.get(u, headers={'user-agent': user_agent})
    # Pause 1-3 seconds between requests.
    time.sleep(random.randint(1, 3))
    response.encoding = 'gbk'
    tree = etree.HTML(response.text)

    # Both queries share the same anchor element.
    anchor = '//dl[@class="list-left public-box"]/dd/a[@target="_blank"]'
    gallery_links = tree.xpath(anchor + '/@href')
    titles = tree.xpath(anchor + '/img/@alt')

    for title, gallery in zip(titles, gallery_links):
        first_image = 'mnfl/' + title + '1.jpg'
        print(first_image)
        if not os.access(first_image, os.F_OK):
            print(gallery)
            get_pic(gallery)
        else:
            print("图片已经存在")
4.获取图片
def get_pic(node: str) -> None:
    """Download every image of the gallery whose first page is ``node``.

    Downloads ``node``, reads the page count from the ``>共…页<`` marker, the
    first image URL and the gallery title, then calls ``save_img`` for image
    numbers ``1`` to the page count, using ``<image directory>/<n>.jpg`` as
    the image URL. A page missing any of these pieces is reported and skipped
    instead of aborting the crawl.

    Args:
        node: URL of the gallery's first page; also passed to ``save_img`` as
            the referer.
    """
    headers = {
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb'
                       'Kit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 '
                       'Safari/537.36')
    }
    # Without a timeout a stalled connection would hang the crawl forever.
    res = requests.get(node, headers=headers, timeout=30)
    # Pause 1-3 seconds between requests.
    time.sleep(random.randint(1, 3))
    res.encoding = 'gbk'

    page_counts = re.findall('>共(.*?)页<', res.text)
    tree = etree.HTML(res.text)
    img_links = tree.xpath('//div/a/img/@src')
    names = tree.xpath('//div[@class="content"]/h5/text()')
    # Indexing an empty result would raise IndexError and stop the whole run.
    if not (page_counts and img_links and names):
        print('页面结构异常,跳过: ' + node)
        return
    try:
        page_count = int(page_counts[0])
    except ValueError:
        print('页数解析失败,跳过: ' + node)
        return

    name = names[0]
    # Everything before the last "/" is the directory holding the images.
    base = img_links[0].rpartition('/')[0]
    for i in range(1, page_count + 1):
        save_img(node, base + '/' + str(i) + '.jpg', name, i)
5.保存图片
def save_img(link: str, img_link: str, name: str, i: int) -> None:
    """Download one image and store it as ``mnfl/<name><i>.jpg``.

    Prints the target path, then skips the download if that file already
    exists. Otherwise requests ``img_link`` with ``link`` as the referer and
    writes the response body to the path, creating the ``mnfl`` directory when
    needed. Failures are reported on stdout rather than raised.

    Args:
        link: URL of the gallery page, sent as the ``Referer`` header.
        img_link: URL of the image to download.
        name: Gallery title used as the file-name prefix.
        i: Image number within the gallery, appended to ``name``.
    """
    path = 'mnfl/' + name + str(i) + '.jpg'
    print(path)
    # Check first so an existing file costs neither a request nor a sleep.
    if os.access(path, os.F_OK):
        print("图片已经存在")
        return

    headers = {
        'Referer': link,
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb'
                       'Kit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 '
                       'Safari/537.36')
    }
    try:
        # Without a timeout a stalled connection would hang the crawl forever.
        img = requests.get(img_link, headers=headers, timeout=30)
    except requests.RequestException:
        img = None
    # Pause 1-3 seconds between requests.
    time.sleep(random.randint(1, 3))

    # Never write an error page to disk under a .jpg name.
    if img is None or img.status_code != 200:
        print('图片存储失败')
        return

    try:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('mnfl', exist_ok=True)
        with open(path, 'wb') as f:
            f.write(img.content)
    except OSError:
        print('图片存储失败')
    else:
        print('图片存储成功')
def main():
    """Crawl every category listed below, one after another."""
    # To crawl a single category, call get_url once with that category's URL
    # instead of looping, e.g. 'https://www.mm131.net/qingchun/'.
    categories = ('xinggan', 'qingchun', 'xiaohua', 'chemo', 'qipao',
                  'mingxing')
    for category in categories:
        get_url('https://www.mm131.net/' + category + '/')
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
转载请注明来源。欢迎对文章中的引用来源进行考证,欢迎指出任何错误或不够清晰的表达。可以在下面的评论区留言,也可以发送邮件至 2621041184@qq.com。