一个极其简洁的Python网页抓取程序
一个极其简洁的Python网页抓取程序
代码如下:
import time
import urllib
import sys
import string
from HTMLParser import HTMLParser

# Symbols to look up; one page is fetched per entry.
ticker_list = ['socl', 'ibb', 'pnqi', 'eirl', 'ita', 'iai',
               'xsd', 'vbk', 'dfe', 'qqq', 'ewi', 'pbd']

# Symbol currently being processed. Set by the loop at the bottom of the
# script and read by MyHTMLParser while it parses that symbol's page.
ticker = ''

# NOTE(review): the start-tag literal that the parser compares against was
# stripped from the published listing (it was an HTML tag). '<h2>' is a best
# guess for the tag that wraps the fund title -- confirm against the page.
TITLE_TAG = '<h2>'


class MyHTMLParser(HTMLParser):
    """Print the date, fund title and two quote fields for the global `ticker`.

    Output is written to stdout as the page is parsed. The title is the text
    node containing "(<TICKER>)" inside TITLE_TAG; the quote fields are text
    nodes whose start tag contains ``yfs_g53_<ticker>`` or
    ``yfs_h53_<ticker>``.

    NOTE(review): the separator literals were also lost from the published
    listing; single spaces are used here, consistent with the sample output.
    """

    # Most recent text node seen; becomes the title prefix if '&' follows it.
    previous_data = ''
    # Title text that preceded an '&' entity inside TITLE_TAG, if any.
    text_before_amp = ''

    def handle_data(self, data):
        """Inspect one text node and print it if it is a field of interest."""
        # get_starttag_text() returns None before the first start tag, so
        # normalise to str once instead of per comparison.
        starttag_text = str(self.get_starttag_text())

        # Fund title, e.g. "... ETF (SOCL)": starts a new output record.
        if (-1 != string.find(data, ('(%s)' % ticker).upper())
                and -1 != string.find(starttag_text, TITLE_TAG)):
            sys.stdout.write(time.strftime('%d/%m/%Y') + ' ')
            # A title containing '&' arrives as two text nodes split by the
            # entity; re-join the first half that handle_entityref saved.
            if '' != self.text_before_amp:
                sys.stdout.write(self.text_before_amp + '&')
            sys.stdout.write(data)

        # First quote field; text containing '-' is skipped.
        if (-1 != string.find(starttag_text, 'yfs_g53_%s' % ticker.lower())
                and -1 == string.find(data, '-')):
            sys.stdout.write(' ' + data)

        # Second quote field; also terminates the output line.
        if -1 != string.find(starttag_text, 'yfs_h53_%s' % ticker.lower()):
            sys.stdout.write(' ' + data + '\n')

        self.previous_data = data

    def handle_entityref(self, name):
        """Remember the text that preceded an ``&amp;`` inside TITLE_TAG."""
        starttag_text = str(self.get_starttag_text())
        if 'amp' == name and -1 != string.find(starttag_text, TITLE_TAG):
            self.text_before_amp = self.previous_data


for t in ticker_list:
    ticker = t
    # Fresh parser per symbol so text_before_amp does not leak between pages.
    parser = MyHTMLParser()
    f = urllib.urlopen('http://finance.yahoo.com/q?s=%s' % ticker)
    try:
        html_string = f.read()
    finally:
        # Release the connection even if the read fails.
        f.close()
    parser.feed(html_string)
示例输出:
03/05/2014 Global X Social Media Index ETF (SOCL) 17.41 17.83
03/05/2014 iShares Nasdaq Biotechnology (IBB) 227.70 233.07
03/05/2014 PowerShares NASDAQ Internet (PNQI) 62.15 62.98
03/05/2014 iShares MSCI Ireland Capped (EIRL) 38.28 38.79
03/05/2014 iShares US Aerospace & Defense (ITA) 108.32 109.70
03/05/2014 iShares US Broker-Dealers (IAI) 37.46 37.89
03/05/2014 SPDR S&P Semiconductor ETF (XSD) 66.89 67.59
03/05/2014 Vanguard Small Cap Growth ETF (VBK) 119.46 120.70
03/05/2014 WisdomTree Europe SmallCap Dividend (DFE) 62.02 62.61
03/05/2014 PowerShares QQQ (QQQ) 87.28 88.11
03/05/2014 iShares MSCI Italy Capped (EWI) 17.89 18.03
03/05/2014 PowerShares Global Clean Energy (PBD) 13.02 13.08
评论关闭