From 2a9a0320eb3239b278e6ff75e85c39cd137f5050 Mon Sep 17 00:00:00 2001 From: Zhouzuo <2232969133@qq.com> Date: Mon, 17 Apr 2017 22:42:55 +0800 Subject: [PATCH 1/2] Update SBB.py python 2 change python 3 --- SBB.py | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/SBB.py b/SBB.py index 13ff48a..6683ec4 100644 --- a/SBB.py +++ b/SBB.py @@ -1,7 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -__version__ = '0.01' +__version__ = '0.02' __author__ = 'Julien G. (@bfishadow)' ''' @@ -10,20 +10,20 @@ Or simply save them anywhere as archives. ''' -import sys, urllib2 +import sys, urllib.request from time import strftime def getBetween(str, str1, str2): strOutput = str[str.find(str1)+len(str1):str.find(str2)] return strOutput -strUsage = "Usage: SBB.py [asc]\n\nExample:\nSBB.py http://blog.sina.com.cn/gongmin desc\nSBB.py http://blog.sina.com.cn/u/1239657051\n" +strUsage = "Usage: SBB.py [asc]\n\nExample:\nSBB.py http://blog.sina.com.cn/gongmin desc\nSBB.py http://blog.sina.com.cn/u/1239657051\n" #Step 0: get target blog homepage URL try : - strUserInput = sys.argv[1] + strUserInput =sys.argv[1] except : - print strUsage + print (strUsage) sys.exit(0) try : @@ -33,18 +33,20 @@ def getBetween(str, str1, str2): #The URL *must* start with http://blog.sina.com.cn/, otherwise the universe will be destroied XD if strUserInput.find("http://blog.sina.com.cn/") == -1 or len(strUserInput) <= 24 : - print strUsage + print(strUsage) + print(strUserInput) sys.exit(0) #Get UID for the blog, UID is critical. -objResponse = urllib2.urlopen(strUserInput) -strResponse = objResponse.read() +objResponse = urllib.request.urlopen(strUserInput) +strResponse = objResponse.read().decode('utf-8') objResponse.close() strUID = getBetween(getBetween(strResponse, "format=html5;", "format=wml;"), "/blog/u/", '">') - +print('用户ID '+strUID) + if len(strUID) > 10 : - print strUsage + print(strUsage) sys.exit(0) #Here's the UID. Most of the UID is a string of ten digits. @@ -54,8 +56,8 @@ def getBetween(str, str1, str2): #Step 1: get list for first page and article count strTargetBlogListURL = "http://blog.sina.com.cn/s/articlelist_" + strTargetUID + "_0_1.html" -objResponse = urllib2.urlopen(strTargetBlogListURL) -strResponse = objResponse.read() +objResponse = urllib.request.urlopen(strTargetBlogListURL) +strResponse = objResponse.read().decode('utf-8') objResponse.close() strBlogPostList = getBetween(getBetween(strResponse,"$blogArticleSortArticleids","$blogArticleCategoryids"), " : [", "],") @@ -71,8 +73,8 @@ def getBetween(str, str1, str2): #Step 2: get list for the rest of pages for intCurrentPage in range(intPageCount - 1) : strTargetBlogListURL = "http://blog.sina.com.cn/s/articlelist_" + strTargetUID + "_0_" + str(intCurrentPage + 2) + ".html" - objResponse = urllib2.urlopen(strTargetBlogListURL) - strResponse = objResponse.read() + objResponse = urllib.request.urlopen(strTargetBlogListURL) + strResponse = objResponse.read().decode('utf-8') strBlogPostList = getBetween(getBetween(strResponse,"$blogArticleSortArticleids","$blogArticleCategoryids"), " : [", "],") strBlogPostID = strBlogPostID + "," + strBlogPostList objResponse.close() @@ -93,8 +95,8 @@ def getBetween(str, str1, str2): for strCurrentBlogPostID in arrBlogPost : intCounter = intCounter + 1 strTargetBlogPostURL = "http://blog.sina.com.cn/s/blog_" + strCurrentBlogPostID + ".html" - objResponse = urllib2.urlopen(strTargetBlogPostURL) - strPageCode = objResponse.read() + objResponse = urllib.request.urlopen(strTargetBlogPostURL) + strPageCode = objResponse.read().decode('utf-8') objResponse.close() #Parse blog title @@ -113,17 +115,17 @@ def getBetween(str, str1, str2): #Write into local file strLocalFilename = "Post_" + str(intCounter) + "_" + strCurrentBlogPostID + ".html" - strHTML4Post = "\n\n\n" + strBlogPostTitle + "\n\n\n\n

" + strBlogPostTitle + "

\n

By: " + strBlogName + " 原文发布于:" + strBlogPostTime + "

\n" + strBlogPostBody + "\n

返回目录

\n\n" - objFileArticle = open(strLocalFilename, "w") - objFileArticle.write(strHTML4Post); + strHTML4Post = '\n\n\n' + strBlogPostTitle + '\n\n\n\n

' + strBlogPostTitle + "

\n

By: " + strBlogName + " 原文发布于:" + strBlogPostTime + "

\n" + strBlogPostBody + '\n

返回目录

\n\n\n' + objFileArticle = open(strLocalFilename, "wb") + objFileArticle.write(strHTML4Post.encode('utf-8')); objFileArticle.close strHTML4Index = strHTML4Index + '
  • ' + strBlogPostTitle + '
  • \n' - print intCounter , "/", intBlogPostCount + print (intCounter , "/", intBlogPostCount) strCurrentTimestamp = str(strftime("%Y-%m-%d %H:%M:%S")) -strHTML4Index = "\n\n\n" + strBlogName + "博客文章汇总\n\n\n

    新浪博客:" + strBlogName + "

    \n

    共" + str(intBlogPostCount) + "篇文章,最后更新:" + strCurrentTimestamp + "

    \n
      \n" + strHTML4Index + "\n
    \n\n" -objFileIndex = open("index.html", "w") -objFileIndex.write(strHTML4Index); +strHTML4Index = '\n\n\n' + strBlogName + "博客文章汇总\n\n\n

    新浪博客:" + strBlogName + "

    \n

    共" + str(intBlogPostCount) + "篇文章,最后更新:" + strCurrentTimestamp + "

    \n
      \n" + strHTML4Index + "\n
    \n\n\n" +objFileIndex = open("index.html", "wb") +objFileIndex.write(strHTML4Index.encode('utf-8')); objFileIndex.close From 9b636e1059da9a62c4546a5612c92ccbc8b0fb11 Mon Sep 17 00:00:00 2001 From: Zhouzuo <2232969133@qq.com> Date: Mon, 17 Apr 2017 22:49:06 +0800 Subject: [PATCH 2/2] Update Readme.md --- Readme.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Readme.md b/Readme.md index 6375430..d0ccf26 100644 --- a/Readme.md +++ b/Readme.md @@ -4,7 +4,7 @@ Based on these downloaded HTML files, you may generate an ebook by importing into [Calibre](http://calibre-ebook.com/). Or, you can simply save them anywhere as archives. -Tested with Python 2.7.8 +Tested with Python 3.5.2 ## Usage SBB.py (Sina Blog URL) (asc|desc) @@ -27,7 +27,7 @@ Licensed under the Apache License, Version 2.0 ##Change log -###Feb 15, 2015 +###April 17, 2017 - [ADDED] timestamp for index and articles. - [ADDED] sort option. Ascending by default. @@ -38,7 +38,7 @@ Licensed under the Apache License, Version 2.0 基于这些下载来的 HTML 文件,您可以借助 [Calibre](http://calibre-ebook.com/) 来生成电子书,或者当作存档。 -请在 Python 2.7.8 下使用。 +请在 Python 3.5.2 下使用。 ## 用法 SBB.py (新浪博客地址) (desc|asc) @@ -61,7 +61,7 @@ Licensed under the Apache License, Version 2.0 ##升级日志 -###2015年2月15日 +###2017年4月17日 - [增加] 索引页面和文章页面增加时间戳。 - [增加] 文章排序选项,默认按发表时间顺序排列。