python 抓取图片,python爬取网页图片
****
# -*- coding:utf-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import re
import time
# Download every ".jpg" answer image from a Zhihu question page.
url = "https://www.zhihu.com/question/22918070"
# Close the HTTP response as soon as the page body has been read.
with request.urlopen(url) as response:
    html = response.read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
# Use Beautiful Soup together with a regular expression to select every <img>
# tag with the given class whose src ends in ".jpg". The dot is escaped so
# that it only matches a literal "." and not any character.
links = soup.find_all('img', "origin_image zh-lightbox-thumb", src=re.compile(r'\.jpg$'))
print(links)
# Directory the images are saved into; without it the files would land in the
# current working directory. The r prefix keeps the string raw, i.e. no
# character in it is treated as an escape sequence.
path = r'/home/kong/PycharmProjects/untitled2/image/'
for link in links:
    print(link.attrs['src'])
    # Name each file after the current timestamp to avoid name collisions.
    # `path` already ends with a separator, so no extra one is inserted
    # (a leading backslash would become part of the file name).
    # request.urlretrieve downloads the remote resource straight to disk.
    request.urlretrieve(link.attrs['src'], path + '%s.jpg' % time.time())
*****
import csv
import requests
import re
import urllib
from collections import namedtuple
from lxml import etree
from bs4 import BeautifulSoup
def schedule(blocknum, blocksize, totalsize):
    """Print the download progress as a whole-number percentage.

    The signature matches the ``reporthook`` callback of ``urlretrieve``.

    Args:
        blocknum: Number of blocks transferred so far.
        blocksize: Size of one block in bytes.
        totalsize: Total size of the download in bytes. A value of zero or
            below means the size is not known, so no percentage can be
            computed.
    """
    if totalsize <= 0:
        # Avoids a ZeroDivisionError / negative percentage for unknown sizes.
        print("current download schedule: unknown")
        return
    # The last block is usually only partly filled, so cap the value at 100.
    per = min(100.0 * blocknum * blocksize / totalsize, 100)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("current download schedule: %d" % per)
# Download every image referenced by an <img> tag on a wallpaper page.
# urlretrieve/urljoin live in different modules on Python 3 and Python 2;
# resolve them once here so the rest of the script runs on either version.
try:
    from urllib.parse import urljoin
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urlparse import urljoin
    from urllib import urlretrieve

# Send a browser-like User-Agent header with the page request.
user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0'
headers = {'User-Agent': user_agent}
page_url = 'http://www.win4000.com/wallpaper_detail_118605.html'
r = requests.get(page_url, headers=headers)
# Stop on an HTTP error instead of scraping an error page.
r.raise_for_status()
html = etree.HTML(r.text)
# Collect the src attribute of every <img> element in the document.
img_urls = html.xpath('.//img/@src')
for i, img_url in enumerate(img_urls):
    # src may be relative or protocol-relative; resolve it against the page
    # URL. Files are saved as img0.jpg, img1.jpg, ... in the current working
    # directory, with `schedule` reporting progress for each download.
    urlretrieve(urljoin(page_url, img_url), 'img' + str(i) + '.jpg', schedule)
python 抓取图片
相关内容
- linux下多python版本,,python -v
- python 学习_collection,,collection
- 《Python金融实战》中文版PDF+英文版PDF+源代码,,下载:
- python简介,,python是一门
- 小贞贞jmeter基于python的分布式,pythonjmeter,jmeter进行分布
- Windows : python+selenium(1)下载安装,python3selenium,1、环境
- Python第二周习题集(一),,# 排列组合c(n,
- C#调用带参数的python脚本,,问题描述:使用C#调
- [tools]python的mkdocs模块分分钟将md搞成一个网站,mkdocs
- python附录-builtins.py模块str类源码(含str官方文档链接)
评论关闭