Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

From these downloaded HTML files, you can generate an ebook by importing them into [Calibre](http://calibre-ebook.com/), or you can simply save them anywhere as archives.

Tested with Python 2.7.8
Tested with Python 3.5.2

## Usage
SBB.py (Sina Blog URL) (asc|desc)
Expand All @@ -27,7 +27,7 @@ Licensed under the Apache License, Version 2.0

## Change log

### Feb 15, 2015
### April 17, 2017

- [ADDED] timestamp for index and articles.
- [ADDED] sort option. Ascending by default.
Expand All @@ -38,7 +38,7 @@ Licensed under the Apache License, Version 2.0

基于这些下载来的 HTML 文件,您可以借助 [Calibre](http://calibre-ebook.com/) 来生成电子书,或者当作存档。

请在 Python 2.7.8 下使用。
请在 Python 3.5.2 下使用。

## 用法
SBB.py (新浪博客地址) (desc|asc)
Expand All @@ -61,7 +61,7 @@ Licensed under the Apache License, Version 2.0

## 升级日志

### 2015年2月15日
### 2017年4月17日

- [增加] 索引页面和文章页面增加时间戳。
- [增加] 文章排序选项,默认按发表时间顺序排列。
50 changes: 26 additions & 24 deletions SBB.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__version__ = '0.01'
__version__ = '0.02'
__author__ = 'Julien G. (@bfishadow)'

'''
Expand All @@ -10,20 +10,20 @@
Or simply save them anywhere as archives.
'''

import sys, urllib2
import sys, urllib.request
from time import strftime

def getBetween(str, str1, str2):
    """Return the part of ``str`` that lies between two marker substrings.

    The slice starts immediately after the first occurrence of ``str1`` and
    stops at the first occurrence of ``str2``. Both markers are located with
    ``find`` from the beginning of ``str``; the search for ``str2`` does not
    start after ``str1``.

    No special handling is done for a missing marker: ``find`` yields -1 in
    that case and the value is used directly in the slice bounds.

    Note: the first parameter is named ``str`` and therefore hides the
    builtin of the same name inside this function.
    """
    # Index of the first character after the opening marker.
    begin = str.find(str1) + len(str1)
    # Index of the closing marker, searched from the start of the text.
    end = str.find(str2)
    return str[begin:end]

strUsage = "Usage: SBB.py <Sina blog URL> [asc]\n\nExample:\nSBB.py http://blog.sina.com.cn/gongmin desc\nSBB.py http://blog.sina.com.cn/u/1239657051\n"
strUsage = "Usage: SBB.py <Sina blog URL> [asc]\n\nExample:\nSBB.py http://blog.sina.com.cn/gongmin desc\nSBB.py http://blog.sina.com.cn/u/1239657051\n"

#Step 0: get target blog homepage URL
try :
strUserInput = sys.argv[1]
strUserInput =sys.argv[1]
except :
print strUsage
print (strUsage)
sys.exit(0)

try :
Expand All @@ -33,18 +33,20 @@ def getBetween(str, str1, str2):

#The URL *must* start with http://blog.sina.com.cn/, otherwise the universe will be destroyed XD
if strUserInput.find("http://blog.sina.com.cn/") == -1 or len(strUserInput) <= 24 :
print strUsage
print(strUsage)
print(strUserInput)
sys.exit(0)

#Get UID for the blog, UID is critical.
objResponse = urllib2.urlopen(strUserInput)
strResponse = objResponse.read()
objResponse = urllib.request.urlopen(strUserInput)
strResponse = objResponse.read().decode('utf-8')
objResponse.close()

strUID = getBetween(getBetween(strResponse, "format=html5;", "format=wml;"), "/blog/u/", '">')

print('用户ID '+strUID)

if len(strUID) > 10 :
print strUsage
print(strUsage)
sys.exit(0)

#Here's the UID. Most of the UID is a string of ten digits.
Expand All @@ -54,8 +56,8 @@ def getBetween(str, str1, str2):
#Step 1: get list for first page and article count
strTargetBlogListURL = "http://blog.sina.com.cn/s/articlelist_" + strTargetUID + "_0_1.html"

objResponse = urllib2.urlopen(strTargetBlogListURL)
strResponse = objResponse.read()
objResponse = urllib.request.urlopen(strTargetBlogListURL)
strResponse = objResponse.read().decode('utf-8')
objResponse.close()

strBlogPostList = getBetween(getBetween(strResponse,"$blogArticleSortArticleids","$blogArticleCategoryids"), " : [", "],")
Expand All @@ -71,8 +73,8 @@ def getBetween(str, str1, str2):
#Step 2: get list for the rest of pages
for intCurrentPage in range(intPageCount - 1) :
strTargetBlogListURL = "http://blog.sina.com.cn/s/articlelist_" + strTargetUID + "_0_" + str(intCurrentPage + 2) + ".html"
objResponse = urllib2.urlopen(strTargetBlogListURL)
strResponse = objResponse.read()
objResponse = urllib.request.urlopen(strTargetBlogListURL)
strResponse = objResponse.read().decode('utf-8')
strBlogPostList = getBetween(getBetween(strResponse,"$blogArticleSortArticleids","$blogArticleCategoryids"), " : [", "],")
strBlogPostID = strBlogPostID + "," + strBlogPostList
objResponse.close()
Expand All @@ -93,8 +95,8 @@ def getBetween(str, str1, str2):
for strCurrentBlogPostID in arrBlogPost :
intCounter = intCounter + 1
strTargetBlogPostURL = "http://blog.sina.com.cn/s/blog_" + strCurrentBlogPostID + ".html"
objResponse = urllib2.urlopen(strTargetBlogPostURL)
strPageCode = objResponse.read()
objResponse = urllib.request.urlopen(strTargetBlogPostURL)
strPageCode = objResponse.read().decode('utf-8')
objResponse.close()

#Parse blog title
Expand All @@ -113,17 +115,17 @@ def getBetween(str, str1, str2):

#Write into local file
strLocalFilename = "Post_" + str(intCounter) + "_" + strCurrentBlogPostID + ".html"
strHTML4Post = "<html>\n<head>\n<meta http-equiv=""Content-Type"" content=""text/html; charset=utf-8"" />\n<title>" + strBlogPostTitle + "</title>\n<link href=""http://simg.sinajs.cn/blog7style/css/conf/blog/article.css"" type=""text/css"" rel=""stylesheet"" />\n</head>\n<body>\n<h2>" + strBlogPostTitle + "</h2>\n<p>By: <em>" + strBlogName + "</em> 原文发布于:<em>" + strBlogPostTime + "</em></p>\n" + strBlogPostBody + "\n<p><a href=""index.html"">返回目录</a></p>\n</body>\n</html>"
objFileArticle = open(strLocalFilename, "w")
objFileArticle.write(strHTML4Post);
strHTML4Post = '<html>\n<head>\n<meta charset="utf-8" />\n<title>' + strBlogPostTitle + '</title>\n<link href="http://simg.sinajs.cn/blog7style/css/conf/blog/article.css" type="text/css" rel="stylesheet" />\n</head>\n<body>\n<h2>' + strBlogPostTitle + "</h2>\n<p>By: <em>" + strBlogName + "</em> 原文发布于:<em>" + strBlogPostTime + "</em></p>\n" + strBlogPostBody + '\n<p><a href="index.html">返回目录</a></p>\n</body>\n</html>\n'
# Write the article page to disk as UTF-8 encoded bytes.
# The previous code ended with a bare `objFileArticle.close` (no call
# parentheses), which only references the method and never closes the file.
# The `with` block closes the handle when it exits, even if write() raises.
with open(strLocalFilename, "wb") as objFileArticle:
    objFileArticle.write(strHTML4Post.encode('utf-8'))

strHTML4Index = strHTML4Index + '<li><a href="' + strLocalFilename + '">' + strBlogPostTitle + '</a></li>\n'

print intCounter , "/", intBlogPostCount
print (intCounter , "/", intBlogPostCount)

strCurrentTimestamp = str(strftime("%Y-%m-%d %H:%M:%S"))
strHTML4Index = "<html>\n<head>\n<meta http-equiv=""Content-Type"" content=""text/html; charset=utf-8"" />\n<title>" + strBlogName + "博客文章汇总</title>\n</head>\n<body>\n<h2>新浪博客:" + strBlogName + "</h2>\n<p>共" + str(intBlogPostCount) + "篇文章,最后更新:<em>" + strCurrentTimestamp + "</em></p>\n<ol>\n" + strHTML4Index + "\n</ol>\n</body>\n</html>"
objFileIndex = open("index.html", "w")
objFileIndex.write(strHTML4Index);
strHTML4Index = '<html>\n<head>\n<meta charset="utf-8" />\n<title>' + strBlogName + "博客文章汇总</title>\n</head>\n<body>\n<h2>新浪博客:" + strBlogName + "</h2>\n<p>共" + str(intBlogPostCount) + "篇文章,最后更新:<em>" + strCurrentTimestamp + "</em></p>\n<ol>\n" + strHTML4Index + "\n</ol>\n</body>\n</html>\n"
# Write the generated table of contents to index.html as UTF-8 encoded bytes.
# The previous code ended with a bare `objFileIndex.close` (no call
# parentheses), which only references the method and never closes the file.
# The `with` block closes the handle when it exits, even if write() raises.
with open("index.html", "wb") as objFileIndex:
    objFileIndex.write(strHTML4Index.encode('utf-8'))