抓取点东西,抓取点,#encoding=ut
文章由Byrx.net分享于2019-03-23 07:03:36
抓取点东西,抓取点,#encoding=ut
# -*- coding: utf-8 -*-
"""Scrape a forum thread list and dump every thread's first post to a text file.

The script downloads the thread index of ``club.iweihai.cn`` (board 122,
type 183), follows each thread link, and writes the thread title plus the
tag-stripped text of its ``#read_tpc`` element to ``OUTPUT_PATH``.

The code runs unchanged on Python 2 and Python 3.
"""
import io
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

try:
    text_type = unicode  # Python 2
except NameError:
    text_type = str  # Python 3

HOST = 'club.iweihai.cn'
INDEX_URL = 'http://%s/thread-htm-fid-122-type-183-type-183.html#tabA' % HOST
OUTPUT_PATH = u"找老公的.txt"
# The pages are decoded as GBK, exactly as the original script did.
PAGE_ENCODING = 'gbk'
# Name the parser explicitly so results do not depend on which optional
# parsers happen to be installed next to Beautiful Soup.
HTML_PARSER = 'html.parser'
SEPARATOR = u'-' * 80 + u'\n'


def strip_tags(t, invalid_tags=None):
    """Return the concatenated, whitespace-stripped text of a node list.

    Args:
        t: Iterable of nodes. A node is either a text string (bs4's
            ``NavigableString`` is a text-string subclass) or any object
            exposing a ``contents`` list of child nodes.
        invalid_tags: Unused. Kept (and only forwarded to recursive calls)
            so existing callers that pass it keep working.

    Returns:
        A text string made of every text node, each stripped of leading and
        trailing whitespace, joined in document order with no separator.
    """
    parts = []
    for node in t:
        if isinstance(node, text_type):
            text = node
        else:
            # Element node: flatten its children recursively.
            text = strip_tags(node.contents, invalid_tags)
        parts.append(text_type(text.strip()))
    # Join once instead of repeated ``+=`` to avoid quadratic copying.
    return u"".join(parts)


def _fetch_html(url):
    """Download *url* and return its body decoded with ``PAGE_ENCODING``."""
    # ``closing`` is needed because Python 2's urlopen result is not a
    # context manager; it guarantees the connection is released.
    with closing(urlopen(url)) as response:
        return response.read().decode(PAGE_ENCODING)


def main():
    """Scrape the thread index and write every thread to ``OUTPUT_PATH``."""
    # Imported here so that ``strip_tags`` stays usable (and importable)
    # on machines where Beautiful Soup is not installed.
    from bs4 import BeautifulSoup

    index = BeautifulSoup(_fetch_html(INDEX_URL), HTML_PARSER)
    # The rows that follow the first ".tr4" row are walked as the thread
    # rows. A missing ".tr4" row still raises IndexError, as before.
    rows = index.select("#threadlist .tr4")[0].find_next_siblings()

    with io.open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        for i, row in enumerate(rows):
            links = row.select(".subject_t")
            if not links:
                # Row without a subject link: nothing to follow.
                continue
            link = links[0]
            # ``.string`` is None when the link contains nested markup.
            title = link.string or link.get_text()
            thread_url = 'http://' + HOST + '/' + link['href']
            thread = BeautifulSoup(_fetch_html(thread_url), HTML_PARSER)

            print('#%d %s %s' % (i, title, thread_url))
            out.write(u'#%d %s\n' % (i, title))
            out.write(SEPARATOR)
            posts = thread.select("#read_tpc")
            body = strip_tags(posts[0].contents, []) if posts else u""
            out.write(body + u'\n')
            out.write(SEPARATOR)


if __name__ == '__main__':
    main()

# This snippet comes from http://byrx.net
评论关闭