Python Web Data Scraping (xpath version)


Compared with the earlier "regular expression version", this version makes the following improvements:

(1) Fetched HTML pages are cached in SQLite, which greatly speeds up reprocessing the data: the first run takes about 6 hours, but later runs finish in roughly 3 minutes.
(2) xpath replaces the earlier regular expressions for HTML parsing. Locating elements with xpath is simpler and more convenient, and it automatically repairs malformed HTML. xpath is really powerful! A short demonstration follows this list.
(3) Duplicate results are removed (see the sketch after the program listing).
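
As a quick illustration of point (2): lxml's HTML parser, which backs the xpath calls in the program below, silently repairs markup such as unclosed tags, so xpath expressions still find their targets. A minimal sketch, with markup invented for the demonstration:

# coding:utf-8
from lxml import etree

# deliberately malformed HTML: <tr> and <td> are never closed
broken = "<table class='n_table'><tr><td><a href='/toronto/'>Toronto</table>"

tree = etree.HTML(broken)    # lxml repairs the markup while parsing
links = tree.xpath("//table[@class='n_table']//a")
print links[0].text          # prints: Toronto
print links[0].get("href")   # prints: /toronto/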


The program code is as follows:

# coding:utf-8
# Practice of scraping web data with xpath
# by redice 2010.11.05

import codecs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import urllib2
from urllib2 import URLError, HTTPError
import zlib
import sqlite3

try:
    import cPickle as pickle
except ImportError:
    import pickle
   

conn = sqlite3.connect("html_cache.db")
conn.text_factory = lambda x: unicode(x, 'utf-8', 'replace')
curs = conn.cursor()

# if the htmls table does not exist, create it; it caches one row per fetched URL
#curs.execute("CREATE TABLE if not exists htmls(url VARCHAR(255) UNIQUE, content BLOB, size INTEGER);")
curs.execute("CREATE TABLE if not exists htmls(url VARCHAR(255) UNIQUE, content TEXT, size INTEGER);")
conn.commit()

def serialize(value):
    """convert object to a compressed pickled string to save in the db
    """
    #return sqlite3.Binary(zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), 5))
    #return sqlite3.Binary(value)
    return value
   
def deserialize(value):
    """convert compressed pickled string from database back into an object
    """
    #return pickle.loads(zlib.decompress(value)) if value else value
    return value
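
# NOTE: the compressed-pickle path in serialize/deserialize is commented out,
# so pages are stored in and read back from the htmls table as plain strings.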

# Fetch the target html
def gethtml(url):
    """Fetch the target html (cache first, then network)"""
    try:
        # look up the html_cache.db first

        curs.execute("select * from htmls where url=?;" ,(url,))
        row = curs.fetchone()
        if row:
            # find the target
            #print deserialize(str(row[1]))
            return deserialize(str(row[1]))

        response = urllib2.urlopen(url)
        result = response.read()
        # insert into the html_cache.db
        curs.execute("insert into htmls values(?,?,?);", (url,serialize(result),len(result)))
        conn.commit()
       
        print "saved %s into html_cache.db" % (url)
       
        return  result
    except URLError, e:
        if hasattr(e, 'reason'):
            print "Failed to reach a server."
            print "Reason: ", e.reason
            return None
        elif hasattr(e, 'code'):
            print "The server couldn't fulfill the request."
            print "Error code: ", e.code
            return None
    #except:
        #return None
   
# end def gethtml


import re

# Fetch all matched strings. Return a list.
def regexmatch(rule, text):
    """Fetch all matched strings. Return a list."""
    p = re.compile(rule)
    return p.findall(text)
# end def regexmatch


# decode HTML entities (e.g. &nbsp;) into plain characters
def decodeHtmlEntity(s):
    """decodeHtmlEntity"""
    if not s:
        return ''
    result = s

    import locale
    # "\xc2\xa0" is the UTF-8 byte sequence for &nbsp;; turn it into a normal space
    result = result.decode(locale.getdefaultlocale()[1], "ignore").encode(locale.getdefaultlocale()[1]).replace("\xc2\xa0", " ")

    return result
# end def decodeHtmlEntity

#final result
dining_db = []

total = 0

# debug flag: when set, stop after the first 10 records
debug = 0

# Fetch menupalace.com's html
print "Fetching html from http://menupalace.com ..."
html = gethtml("http://menupalace.com")

from lxml import etree

if html == '' or html is None:
    print "Can't get the html from http://menupalace.com"
    sys.exit()

try:
    tree = etree.HTML(html)
    nodes = tree.xpath("//table[@class='n_table']")
except:
    f = open("log.txt", "a")
    f.write(html)
    f.close()
    print "failed to parse the html from http://menupalace.com"
    sys.exit()

for node in nodes:
    if debug and total>=10:
        break

    n = node.xpath("./tr[1]/td[1]/img")
    # Fetch country
    country = ""
    if len(n)>0:
        country = decodeHtmlEntity(n[0].tail)
        country = country.strip()

    # Fetch all links
    ls = node.xpath(".//a")

    # Iterate over all links
    for l in ls:
        if debug and total>=10:
            break
       
        #city
        city = decodeHtmlEntity(l.text)
        city = city.strip()
       
        prelink = l.get("href")
        link = prelink + "restaurants/restaurants.aspx"

        #print "Fetching html from " + link + " ..."
        html = gethtml(link)
        if html == '' or html is None:
            print "Can't get the html from " + link
            continue
       
        try:
            subtree = etree.HTML(html)
            subnodes = subtree.xpath("//td[@class='frame_style_padding']")
        except:
            if debug:
                print "failed to parse the html from " + link
            continue
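
The original listing breaks off at this point (the remaining steps would parse each subnode and append records to dining_db). For improvement (3), removing duplicate results, here is a minimal sketch of one way to filter dining_db before output; treating the whole record as the deduplication key is an assumption for illustration, not the original code:

# improvement (3): drop duplicate records before printing the results
seen = set()
unique_dining_db = []
for record in dining_db:
    key = tuple(record)    # whole record as the dedup key -- an assumption
    if key not in seen:
        seen.add(key)
        unique_dining_db.append(record)

for record in unique_dining_db:
    print record

print "total: %d" % len(unique_dining_db)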
