python对大文件进行排序,python文件排序,import oscla
python对大文件进行排序,python文件排序,import oscla
import heapq
import os


class FileSort(object):
    """Sort the lines of a text file, spilling to chunk files when it is big.

    A file whose size does not exceed the split size is sorted in memory.
    A bigger file is cut into chunk files named ``<input stem>.NNN`` next
    to the input, each chunk is sorted on its own, and the sorted chunks
    are merged into the output file. The chunk files are removed afterwards.

    NOTE: chunk names are derived from the input path without its
    extension, so an input that is itself named ``<stem>.NNN`` would
    collide with its own chunks.
    """

    def __init__(self, inFile, outFile=None, splitSize=20):
        """Remember the paths and the split threshold.

        inFile    -- path of the file to sort.
        outFile   -- path of the result; defaults to ``inFile`` (in place).
        splitSize -- split size in MB (multiplied by 1,000,000 internally).
        """
        self._inFile = inFile
        self._outFile = inFile if outFile is None else outFile
        self._splitSize = splitSize * 1000000
        self.setKeyExtractMethod()

    def setKeyExtractMethod(self, keyExtractMethod=None):
        """Set the function that extracts the sort key from a line.

        Example::

            def f(line):
                return line[1:3], line[5:10]

        With ``None`` the whole line is the key.
        """
        if keyExtractMethod is None:
            self._getKey = lambda line: line
        else:
            self._getKey = keyExtractMethod

    def sort(self):
        """Sort the input file into the output file."""
        files = self._splitFile()
        if files is None:
            # File size <= self._splitSize: sort it directly in memory.
            self._sortFile(self._inFile, self._outFile)
            return
        try:
            for fn in files:
                self._sortFile(fn)
            self._mergeFiles(files)
        finally:
            # Remove the chunk files even when sorting or merging failed.
            self._deleteFiles(files)

    def _sortFile(self, fileName, outFile=None):
        """Sort ``fileName`` in memory; write to ``outFile`` or in place."""
        with open(fileName) as src:
            lines = src.readlines()
        # Only the last line can lack its newline. Terminate it, otherwise
        # it would be glued to whichever line follows it after sorting.
        if lines and not lines[-1].endswith('\n'):
            lines[-1] += '\n'
        get_key = self._getKey
        data = [(get_key(line), line) for line in lines]
        data.sort()
        target = fileName if outFile is None else outFile
        with open(target, 'w') as dst:
            dst.writelines(line for _, line in data)

    def _splitFile(self):
        """Cut the input into chunk files of roughly the split size.

        Returns the list of chunk file names, or ``None`` when the input
        is small enough that no split is needed.
        """
        if os.path.getsize(self._inFile) <= self._splitSize:
            # Do not split the file, it isn't that big.
            return None
        stem = os.path.splitext(self._inFile)[0]
        fileNames = []
        lines = []

        def flush():
            # Write the pending lines as the next numbered chunk file.
            tmpFile = stem + '.%03d' % (len(fileNames) + 1)
            fileNames.append(tmpFile)
            with open(tmpFile, 'w') as chunk:
                chunk.writelines(lines)
            del lines[:]

        size = 0
        with open(self._inFile) as src:
            for line in src:
                size += len(line)
                lines.append(line)
                if size >= self._splitSize:
                    flush()
                    size = 0
        if lines:
            flush()
        return fileNames

    def _mergeFiles(self, files):
        """Merge the already sorted chunk ``files`` into the output file.

        Lines are ordered by ``(key, line)``, the same ordering that
        ``_sortFile`` uses, so the split and non-split paths agree.
        """
        get_key = self._getKey

        def keyed(handle):
            # Decorate each line with its key so heapq.merge can compare.
            for line in handle:
                yield get_key(line), line

        handles = []
        try:
            for name in files:
                handles.append(open(name))
            with open(self._outFile, 'w') as output:
                merged = heapq.merge(*[keyed(h) for h in handles])
                output.writelines(line for _, line in merged)
        finally:
            for handle in handles:
                handle.close()

    def _deleteFiles(self, files):
        """Remove the given chunk files."""
        for fn in files:
            os.remove(fn)


def sort(inFileName, outFileName=None, getKeyMethod=None):
    """Sort the lines of ``inFileName`` with the default split size.

    inFileName   -- path of the file to sort.
    outFileName  -- path of the result; defaults to sorting in place.
    getKeyMethod -- optional function mapping a line to its sort key.
    """
    fs = FileSort(inFileName, outFileName)
    if getKeyMethod is not None:
        fs.setKeyExtractMethod(getKeyMethod)
    fs.sort()
I couldn't find an existing implementation of this kind of sorting algorithm. I'm not sure about the performance; maybe somebody has a better/faster solution.
相关内容
- python查看文件的alpha通道,,通过命令行参数,指定检测
- Python 操作 Windows 粘贴板,pythonwindows,Python 操作 Wi
- 自动登陆人人网 输出好友列表,自动登陆好友列表,自
- 在python中动态执行包含python脚本的字符串,python脚本
- wxPython 如何使用windows系统剪贴板,wxpython剪贴板,[Pyth
- python模拟登陆163获取邮件列表,python邮件列表,通过登陆
- Django CheatSheet,djangocheatsheet,from django.
- python socket.shutdown用法解释,,# -*- coding
- python遍历文件,python遍历,下面的python代码片
- jython编译java代码并执行,jython编译java,下面的代码片段
评论关闭