Python 程序寫好了。把你的博客地址填進程序裡(修改 url1),執行後會自動下載你博客的所有文章和圖片。這個程序主要是針對 tinypic.com 寫的:那個圖片網站今年要關閉,所以先把圖片都存起來。如果你會用 Python,歡迎試用,也希望聽到你的改進意見。
"""Download every article and JPEG image of a wenxuecity.com blog.

Run the following in a CMD window:

    make a new directory for the blog pages and change into it
    chcp 936            (only if Chinese text is not displayed)
    python wenxuecity2.py

Each article is written to ``P<number>.htm`` in the current directory and
each of its JPEG images to ``images/P<number>I<index>.jpg``; the saved page
is rewritten to reference the local image copies.

The script runs unchanged on Python 2.7 and Python 3.
"""
import contextlib
import io
import os
import re

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3 exposes the same urlopen()

# *** Modify url1 to be your wenxuecity blog ***
#url1="http://blog.wenxuecity.com/myindex/33408/" #dingzhuang
url1 = "http://blog.wenxuecity.com/myindex/24769/"  # xiuyuan
#url1="https://blog.wenxuecity.com/myoverview/26805/" #test

SITE = "http://blog.wenxuecity.com"
LIST_URL = SITE + "/blog/frontend.php?page=%d&act=articleList&blogId=%s"
# A listing page holding exactly this many entries is taken to mean that
# another page follows; it is also the stride used for article numbering.
PAGE_SIZE = 60
# The line carrying an entry's timestamp sits this many lines below the
# line that carries the 'class="atc_title"' marker.
TIME_LINE_OFFSET = 10
# Seconds to wait on the network before giving up on one request.
FETCH_TIMEOUT = 30


def toText(data):
    """Return *data* as text, decoding byte strings as UTF-8."""
    if isinstance(data, bytes):
        return data.decode("utf-8", "replace")
    return data


def fetch(url):
    """Return the raw bytes served at *url*, always closing the connection."""
    with contextlib.closing(
            urllib2.urlopen(url, timeout=FETCH_TIMEOUT)) as response:
        return response.read()


def blogIdFromUrl(url1):
    """Return the first run of digits in the blog URL (used as the blog id).

    Raises:
        ValueError: if the URL contains no digits at all.
    """
    found = re.search(r"[0-9]+", url1)
    if found is None:
        raise ValueError("no blog id found in %r" % (url1,))
    return found.group()


def extractContent(html):
    """Return the article body found in *html*, or None when it is absent.

    The body is the text between the ``articalContent">`` marker and the
    next closing tag, with surrounding whitespace stripped.
    """
    # NOTE(review): the closing-tag lookahead was lost in the published copy
    # of this script ('(?=)'); '</div>' is assumed - confirm on a live page.
    found = re.search(r'(?<=articalContent">).+?(?=</div>)', html, re.DOTALL)
    if found is None:
        return None
    return found.group().strip()


def findImageSources(content):
    """Return the distinct .jpg URLs used by <img> tags, in first-seen order."""
    sources = []
    for tag in re.findall(r"<img[^>]*>", content, re.IGNORECASE):
        # The character class keeps the match inside one attribute value, so
        # a later attribute that merely mentions ".jpg" is not swallowed.
        found = re.search(r"http[^\"'<>\s]+\.jpg", tag, re.IGNORECASE)
        if found is not None and found.group() not in sources:
            sources.append(found.group())
    return sources


def parseArticleList(lines):
    """Return ``(link_html, time)`` for every entry of an article listing.

    *lines* is the listing page split into lines.  An entry starts at a line
    containing ``class="atc_title"``; the next three lines, stripped and
    joined, form its link, and the text between tags on the line
    TIME_LINE_OFFSET below is its time ("" when that line is missing or
    holds no such text).
    """
    entries = []
    for index, line in enumerate(lines):
        if 'class="atc_title"' not in line:
            continue
        link = "".join(part.strip() for part in lines[index + 1:index + 4])
        time = ""
        if index + TIME_LINE_OFFSET < len(lines):
            found = re.search(r"(?<=>).+(?=<)",
                              lines[index + TIME_LINE_OFFSET])
            if found is not None:
                time = found.group(0)
        entries.append((link, time))
    return entries


# save images for page
def saveHtml(page, _link, time):
    """Save one article and its JPEG images to disk.

    Args:
        page: article number; used in the page and image file names.
        _link: HTML of the article link; must contain an absolute URL ending
            in "html".  It is written at the top of the saved page.
        time: timestamp text written next to the link.

    Raises:
        ValueError: if *_link* contains no article URL.
        IOError/OSError: if the article cannot be fetched or written.
    """
    _link = toText(_link)
    time = toText(time)
    found = re.search(r"http[^\"'<>\s]+html", _link)
    if found is None:
        raise ValueError("no article URL in list entry: %r" % (_link,))

    html = toText(fetch(found.group()))
    content = extractContent(html)
    if content is None:
        # Keep the whole page rather than lose the article.
        print("warning: article body not found on page %d, saving full page"
              % page)
        content = html.strip()

    for cnt, src in enumerate(findImageSources(content), 1):
        des = "images/P%04dI%03d.jpg" % (page, cnt)
        try:
            data = fetch(src)
            with open(des, "wb") as out:
                out.write(data)
        except Exception as err:  # best effort: one bad image must not abort
            print("warning: could not save %s: %s" % (src, err))
            continue  # leave the remote URL in place; no local copy exists
        content = content.replace(src, des)

    # newline="" keeps "\n" untranslated, like the binary mode used before.
    with io.open("P%04d.htm" % page, "w", encoding="utf-8",
                 newline="") as out:
        out.write(_link + " " + time + "\n" + content + "\n")


# process pages
def getPage(_page, url1):
    """Download every article of one listing page.

    Args:
        _page: zero-based index of the listing page.
        url1: blog URL; its first run of digits is used as the blog id.

    Returns:
        The number of articles listed on the page.
    """
    listing = toText(fetch(LIST_URL % (_page, blogIdFromUrl(url1))))
    entries = parseArticleList(listing.split("\n"))
    for count, (link, time) in enumerate(entries, 1):
        number = _page * PAGE_SIZE + count
        print("%d %s" % (number, link))
        # Links in the listing are site-relative; make them absolute.
        link = link.replace("/my", SITE + "/my")
        try:
            saveHtml(number, link, time)
        except (IOError, OSError, ValueError) as err:
            # One broken article should not stop the rest of the download.
            print("warning: article %d not saved: %s" % (number, err))
    return len(entries)


# Main loop
def getBlog(url1):
    """Download all listing pages of the blog at *url1*.

    Creates the ``images`` directory if needed, then fetches listing pages
    until one holds fewer than PAGE_SIZE articles.
    """
    if not os.path.isdir("images"):
        os.makedirs("images")
    _page = 0
    while getPage(_page, url1) == PAGE_SIZE:
        _page += 1


if __name__ == "__main__":
    getBlog(url1)