"""Count primary posts and follow-up posts per user in a wenxuecity forum.

Author: nearby (moderator of the 書香之家 forum), November 2022.

The program walks the list pages of one forum (for example ``sxsj``) from the
newest page backwards until it meets a primary post (主帖) older than a date
chosen by the user.  For every user it counts primary posts and follow-up
posts (跟帖) separately and writes ``username,primary,followup`` rows to a
CSV file.

Note from the author: spreadsheet programs may not display the Chinese
characters of the CSV file correctly.  Open the file in Notepad or another
text editor and copy/paste the rows into a spreadsheet instead.
"""

import datetime
import re

# Line that opens a primary post in the list page.
_POST_MARKER = '<!-- -->'
# Line that may directly follow the post marker when an advertisement is
# inserted in the list.  The first spelling is the one this script has always
# tested for; the second is the same comment in simplified script.
# NOTE(review): accepted as well in case the page is served in that script -
# confirm against the live markup.
_AD_MARKERS = ('<!-- 列表中插廣告 -->', '<!-- 列表中插广告 -->')
# Distance from the line after the post marker to the author's profile link.
_AUTHOR_OFFSET = 9
_AUTHOR_OFFSET_WITH_AD = 12
# Start of a profile link line, e.g.
# <a class="b" href="https://passport.wenxuecity.com/members/index.php?act=profile&cid=ling_yin_shi">ling_yin_shi</a>
_PROFILE_LINK_PREFIX = '<a class="b" href='
# Line that closes the list of follow-up posts of one primary post.
_BLOCK_END = '</div>'
# Posting date as shown in the list, e.g. 11/07/2022 (month/day/year).
_DATE_RE = re.compile(r'(\d{1,2})/(\d{1,2})/(\d{4})')


def _anchor_text(line):
    """Return the text between the first '>' and the next '<', or None."""
    pieces = line.split('>')
    if len(pieces) < 2:
        return None
    return pieces[1].split('<')[0]


def _parse_post_date(line):
    """Return ``(datetime, 'YYYY-MM-DD' label)`` if *line* starts with a date.

    Returns None when the stripped line does not begin with MM/DD/YYYY or the
    numbers do not form a valid calendar date.
    """
    found = _DATE_RE.match(line.strip())
    if found is None:
        return None
    month, day, year = found.groups()
    try:
        when = datetime.datetime(int(year), int(month), int(day))
    except ValueError:
        return None
    return when, year + "-" + month + "-" + day


def processOneFile(us_dict, html, fromDay, guanshui=False):
    """Add the posts found on one forum list page to the per-user counters.

    Args:
        us_dict: dictionary updated in place.  Key = user name, value = a
            two-element list ``[primary posts, follow-up posts]``.
        html: the fetched page; only its ``text`` attribute is read.
        fromDay: ``datetime.datetime`` lower bound of the search.
        guanshui: when true, follow-up posts that a user wrote under his or
            her own primary post are not counted.

    Returns:
        False when the caller should stop paging: either a primary post
        older than *fromDay* was met (scanning stops right there), or the
        page contained no readable primary post at all (end of the forum or
        an unexpected page).  True otherwise.
    """
    lines = html.text.split('\n')
    total = len(lines)
    posts_seen = 0
    i = 0
    while i < total:
        if lines[i].strip() != _POST_MARKER:
            i += 1
            continue

        # The author's profile link sits a fixed number of lines below the
        # marker; an inserted advertisement pushes it three lines further.
        i += 1
        if i >= total:
            break
        if lines[i].strip() in _AD_MARKERS:
            i += _AUTHOR_OFFSET_WITH_AD
        else:
            i += _AUTHOR_OFFSET
        if i >= total:
            break
        user = _anchor_text(lines[i].strip())
        if user is None:
            # Not a link line: skip this entry instead of crashing.
            i += 1
            continue

        # The distance to the date line varies (users with a blog have extra
        # lines), so search forward for it.  Matching on the date itself
        # keeps the search independent of how the page pads the date.
        i += 1
        parsed = None
        while i < total:
            parsed = _parse_post_date(lines[i])
            if parsed is not None:
                break
            i += 1
        if parsed is None:
            break
        posted, label = parsed
        print(label)
        posts_seen += 1

        if posted < fromDay:
            return False

        us_dict.setdefault(user, [0, 0])[0] += 1

        # Follow-up posts: every profile link up to the closing </div>.
        i += 1
        while i < total:
            line = lines[i].strip()
            if line == _BLOCK_END:
                break
            if line.startswith(_PROFILE_LINK_PREFIX):
                sub_user = _anchor_text(line)
                if sub_user is not None and (sub_user != user or not guanshui):
                    us_dict.setdefault(sub_user, [0, 0])[1] += 1
            i += 1
        i += 1

    # A page without any primary post cannot move the search forward;
    # reporting False here keeps the caller from paging forever.
    return posts_seen > 0


def main():
    """Ask for forum, start date and options, scan the forum, write the CSV."""
    # Imported here so that the parsing code above can be imported and
    # tested without the third-party dependency or network access.
    import requests

    print()
    print('# Author: 書香之家版主 nearby, August 2022 version 1, November 2022 version 2. This is V2')
    print()

    subid = 'sxsj'
    temp = input('What is the name of your 論壇 in English? For example, 書香之家 is sxsj, 美語世界 is mysj, 文化走廊 is culture, 詩詞欣賞 is poetry: ').strip()
    if len(temp) >= 2:
        subid = temp

    fromdd = "2022-10-01"
    print('The search is from today to a date in the past, i.e. the search is backward to the history.')
    print('For example, the program can search from today back to 2022-01-01. It first search for the current page, ')
    print('then it goes to the next page, until it goes beyond 2022-01-01. In this case, it stops when it ')
    print('runs into a 主帖 that is published before 2022-01-01.')
    temp = input('Searching from today to which date in the past? Please enter the date in the format like: 2022-01-01: ')
    if len(temp) >= 2:
        fromdd = temp.strip()
    print("fromDate =" + fromdd)
    templl = fromdd.split('-')
    try:
        fromDate = datetime.datetime(int(templl[0]), int(templl[1]), int(templl[2]))
    except (ValueError, IndexError):
        raise SystemExit('Invalid date "' + fromdd + '". Please use the format 2022-01-01.')

    # Option added after a forum discussion about users padding their own
    # threads with follow-up posts.
    temp = input('Discard those 跟帖 that a user made after his/her own post? (1=yes, 0=no, default=0)\n' +
                 'Sometimes a user only post 跟帖 after his/her own 主帖. If yes, then such 跟帖 will be discarded. ').strip()
    try:
        guanshui = len(temp) > 0 and int(temp) > 0
    except ValueError:
        raise SystemExit('Invalid answer "' + temp + '". Please enter 1 or 0.')
    print('guanshui=' + str(guanshui))

    users = dict()
    page = 1
    goOn = True
    while goOn:
        url = 'https://bbs.wenxuecity.com/' + subid + '/?page=' + str(page)
        page += 1
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=30)
        except requests.RequestException as err:
            print('Stopping: could not fetch ' + url + ' (' + str(err) + ')')
            break
        if not response.ok:
            print('Stopping: ' + url + ' returned HTTP status ' + str(response.status_code))
            break
        goOn = processOneFile(users, response, fromDate, guanshui)

    print("\n---------------\n")
    with open('sxzj-out.csv', 'w', encoding='utf-8') as out:
        for u, counts in users.items():
            row = u + ',' + str(counts[0]) + ',' + str(counts[1])
            print(row)
            out.write(row + '\n')

    print("\n")
    print("\n")
    print("Please check the file sxzj-out.csv. The result is in it! Thanks for using this program. ---- 虎哥 / Nearby / 鄰兄 / 近兄")


if __name__ == '__main__':
    main()
統計網友活動的python程序。鄰兄拒不跟帖,放在這裏當做存根,須用的請自用
所有跟帖:
•
盲讚。鄰兄太nice。:)
-塵凡無憂-
♀
(0 bytes)
()
08/28/2022 postreply
11:09:13
•
你真黑!
-kirn-
♀
(0 bytes)
()
08/28/2022 postreply
11:41:41
•
只能佩服了。。。。借這裏和無憂說一下,無憂之前提過延長新冠活動一個星期,我想了想,
-FionaRawson-
♀
(314 bytes)
()
08/28/2022 postreply
12:19:12
•
啊,我剛才在上面說都沒看到你這個。。。心有靈犀握握手。:)
-塵凡無憂-
♀
(0 bytes)
()
08/28/2022 postreply
12:26:59
•
活動延長到9月10號。我知道高妹還有很多想說的。。。不過你看自己的時間安排。:)
-塵凡無憂-
♀
(0 bytes)
()
08/28/2022 postreply
12:41:34
•
謝謝
-FionaRawson-
♀
(0 bytes)
()
08/28/2022 postreply
12:51:39
•
服,黑,了!
-lovecat08-
♀
(0 bytes)
()
08/28/2022 postreply
12:27:35
•
虎哥,活雷鋒英文怎麽翻?:)
-妖妖靈-
♀
(0 bytes)
()
08/28/2022 postreply
14:03:45
•
讚
-望沙-
♀
(0 bytes)
()
08/28/2022 postreply
14:33:55
•
這是功力。也是愛呀,:)
-ling_yin_shi-
♂
(32 bytes)
()
08/28/2022 postreply
16:37:31
•
更新後更好用些, 方便做統計工作。鄰兄記載於此,2022-11月-08
-nearby-
♂
(0 bytes)
()
11/08/2022 postreply
20:10:13