python同步重庆时时彩的数据到本地,,依赖mysql,requ


依赖mysql,requests,torndb,从cp.360.com同步指定日期开始的时时彩数据回本地数据库中,支持增量更新,实时同步。

1.[代码] 时时彩数据爬虫

#!/usr/bin/env python#-*-coding:utf-8-*-"""    从 cp.360.cn 同步时时彩的数据到表 haoma"""import datetimefrom datetime import timedeltaimport timeimport torndbimport sysimport requestsimport refrom mylogger import get_loggerreload(sys)sys.setdefaultencoding('utf-8')DBHOST = "localhost:3306"SCHEMA = "CAIPIAO"DBUSER = "user"DBPASSWD = "passwd"db = torndb.Connection(host=DBHOST, database=SCHEMA, user=DBUSER, password=DBPASSWD)cplog = get_logger("caipiao")class Data_Sync(object):    ssc_re = re.compile(r'<td class=\'gray\'>(.*?)</td>(<td class=\'red big\'>|<td style=\'width:65px\'>)(.*?)</td>.*?<tr>')    def __init__(self, start_date="20150101", sleep_secs = 10, run_ever=True):        self.start_date = start_date if start_date > "20130101" else "20150101"        self.run_ever = run_ever        self.base_url = "http://chart.cp.360.cn/kaijiang/kaijiang?lotId=255401&spanType=2&span="        self.latest_date = ''        self.latest_period = ''        self.need_sleep = False        self.sleep_secs = sleep_secs    def run(self):        while True:            if self.need_sleep:                time.sleep(self.sleep_secs)                self.need_sleep = False            else:                self.sync_data_from_360()    def sync_data_from_360(self):        """ 根据数据库中最新一条数据,从 cp.360.com 同步数据至最新数据 """        self.get_latest_haoma_from_mysql()        if not self.latest_date:            cplog.info("db has no data, so start at {0}".format(self.start_date))            self.latest_date = self.start_date            self.latest_period = "000"        cplog.info("in db, item_date={0}, period={1}".format(self.latest_date, self.latest_period))        if self.latest_date:            cur_date = datetime.datetime.utcnow() + timedelta(hours=8)            latest_date = datetime.datetime.strptime(self.latest_date, "%Y%m%d")            """                更新规则:                1、检查是否同一天,如果不是,就下载数据,执行步骤2,增加天数,直到数据库日期与当前日一致;                2、检查数据库中的期数与下载回来的数据的最新期是否一致,一致,检查日期是否一致,是就跳过,否则插入数据;            """            dl_times = 0            while (cur_date - latest_date).days > 0:                if int(self.latest_period) < 120:                    dl_date = latest_date.strftime("%Y-%m-%d")                    dl_url = self.base_url + dl_date + "_" + dl_date                    data = self.download_with_requests(dl_url)                    if not data:                        if dl_times < 3:                            dl_times += 1                            time.sleep(2)                            continue                        else:                            latest_date += timedelta(1)                            continue                    dl_times = 0                    self.latest_date = latest_date.strftime('%Y%m%d')                    lottery_numbers = data[int(self.latest_period):]                    self.insert_into_mysql(self.latest_date, lottery_numbers)                    latest_date += timedelta(1)                else:                    latest_date += timedelta(1)                    self.latest_period = "000"            """ 更新当日数据 """            dl_date = latest_date.strftime("%Y-%m-%d")            dl_url = self.base_url + dl_date + "_" + dl_date            data = self.download_with_requests(dl_url)            if data:                lottery_numbers = data[int(self.latest_period):]                self.latest_date = latest_date.strftime('%Y%m%d')                self.insert_into_mysql(self.latest_date, lottery_numbers)    def insert_into_mysql(self, item_date, datas):        insert_datas = []        for data in datas:            period = data[0]            date_period = item_date + period            lottery_number = data[2]            if not re.search('\d+',2000 lottery_number):                continue            a, b, c, d, e = list(lottery_number)            insert_data = (item_date, period, date_period, lottery_number, a, b, c, d, e)            insert_datas.append(insert_data)        if insert_datas:            cplog.info("current insert into haoma:{0}, {1}".format(item_date, datas))            sql = "insert into haoma(item_date, period, date_period, lottery_number, a, b, c, d, e) values(%s, %s, %s, %s, %s,  %s, %s, %s, %s)"            try:                db.executemany(sql, insert_datas)            except Exception as e:                print e                sys.exit(1)        else:            cplog.info("no more new data to sync, wait for {0} seconds".format(self.sleep_secs))            self.need_sleep = True    def get_latest_haoma_from_mysql(self):        sql = "select * from haoma order by date_period desc limit 1"        ret = db.get(sql)        if ret:            self.latest_date = ret.item_date            self.latest_period = ret.period    def download_with_requests(self, url):        cplog.info("download: {0}".format(url))        data = []        try:            r = requests.get(url, timeout=10)            if r.status_code == 200:                data = self.ssc_re.findall(r.content)            else:                cplog.info("download err, http status_code:{0}".format(r.status_code))        except Exception as e:            cplog.info("call requests raise Exception: {0}".format(e))        finally:            return datadef run():    sync = Data_Sync(start_date="20140101", sleep_secs=30)    sync.run()if __name__ == "__main__":    run()

编橙之家文章,

评论关闭