# Author: 書香之家版主 nearby, November 2022
#
# This program allows you to analyze the activities of all the users in a WXC 論壇, for example, 書香之家(sxsj).
# It counts the numbers of 主帖 and 跟帖 respectively for each user.
# The result is written to a .csv file. Note: opening the CSV file directly may not show the Chinese characters correctly.
# Instead, open the result in Notepad or another text editor and then copy/paste it into an Excel file.
#
#
import requests
import datetime
# import sys
# Markers and prefixes the index page uses around each primary post.
_POST_MARKER = '<!-- -->'
_AD_MARKER = '<!-- 列表中插廣告 -->'
_USER_LINK_PREFIX = '<a class="b" href='
_POST_END = '</div>'


def _anchor_text(line):
    """Return the text between the first '>' and the following '<' of a link line."""
    # The line looks like:
    # <a class="b" href="https://passport.wenxuecity.com/members/index.php?act=profile&cid=ling_yin_shi">ling_yin_shi</a>
    return line.strip().split('>')[1].split('<')[0]


def processOneFile(us_dict, html, fromDay, skip_self_replies=None):
    """Count the primary posts and replies listed on one forum index page.

    The page text is split on newlines and scanned from top to bottom. Each
    primary post starts at a line equal to '<!-- -->'; its author, date and
    replies are located relative to that marker.

    Args:
        us_dict: dict mapping username -> [primary post count, reply count].
            Updated in place.
        html: the fetched page; only its ``text`` attribute is read.
        fromDay: ``datetime.datetime`` cut-off. Posts dated on or after it
            are counted.
        skip_self_replies: when true, replies written by the author of the
            primary post they belong to are not counted. When ``None`` (the
            default) the module-level ``guanshui`` flag is used if it exists,
            otherwise ``False``; this keeps the original three-argument call
            working unchanged.

    Returns:
        ``False`` as soon as a primary post dated before ``fromDay`` is met
        (nothing after it on the page is processed), otherwise ``True``. A
        page that ends in the middle of a post stops the scan and still
        returns ``True`` instead of raising ``IndexError``.

    Raises:
        IndexError, ValueError: if the author line or the date line does not
            have the expected layout.
    """
    if skip_self_replies is None:
        # Backward compatibility: the script stores the user's choice in a
        # module-level variable named ``guanshui``.
        skip_self_replies = bool(globals().get('guanshui', False))

    lines = html.text.split('\n')
    total = len(lines)
    i = 0
    while i < total:
        if lines[i].strip() != _POST_MARKER:
            i += 1
            continue

        # The line right after the marker tells whether an ad block was
        # inserted in the list, which pushes the post 3 lines further down.
        i += 1
        if i >= total:
            break
        jump = 9 if lines[i].strip() == _AD_MARKER else 6

        # The author link sits 3 lines after the start of the post entry.
        i += jump + 3
        if i >= total:
            break
        user = _anchor_text(lines[i])

        # The distance to the date differs between users with and without a
        # blog, so search for it: it is the first following line that ends
        # with a space, e.g. " 11/08/2022 ".
        i += 1
        while i < total and not lines[i].endswith(' '):
            i += 1
        if i >= total:
            break
        ld = lines[i].strip().split(' ')[0].split('/')  # MM/DD/YYYY
        print(ld[2]+"-"+ld[0]+"-"+ld[1])
        posted = datetime.datetime(int(ld[2]), int(ld[0]), int(ld[1]))

        if posted < fromDay:
            # Pages are scanned newest first; tell the caller to stop.
            return False

        us_dict.setdefault(user, [0, 0])[0] += 1

        # Everything up to the closing </div> belongs to this post; every
        # user link found there is one reply.
        i += 1
        while i < total:
            line = lines[i].strip()
            if line == _POST_END:
                break
            if line.startswith(_USER_LINK_PREFIX):
                sub_user = _anchor_text(line)
                if sub_user != user or not skip_self_replies:
                    us_dict.setdefault(sub_user, [0, 0])[1] += 1
            i += 1
        i += 1  # step past the closing </div>
    return True
# ---- main starts here ----
# Interactive script: ask for the forum id, the cut-off date and the
# self-reply option, crawl the forum index pages from newest to oldest, then
# print the per-user counts and write them to sxzj-out.csv.
print()
print('# Author: 書香之家版主 nearby, August 2022 version 1, November 2022 version 2. This is V2')
print()

# Forum id used in the URL; an answer shorter than 2 characters keeps the default.
subid = 'sxsj'
temp = input('What is the name of your 論壇 in English? For example, 書香之家 is sxsj, 美語世界 is mysj, 文化走廊 is culture, 詩詞欣賞 is poetry: ')
if len(temp) >= 2:
    subid = temp

# Cut-off date; an answer shorter than 2 characters keeps the default.
fromdd = "2022-10-01"
print('The search is from today to a date in the past, i.e. the search is backward to the history.')
print('For example, the program can search from today back to 2022-01-01. It first search for the current page, ')
print('then it goes to the next page, until it goes beyond 2022-01-01. In this case, it stops when it ')
print('runs into a 主帖 that is published before 2022-01-01.')
temp = input('Searching from today to which date in the past? Please enter the date in the format like: 2022-01-01: ')
if len(temp) >= 2:
    fromdd = temp.strip()
print("fromDate =" + fromdd)
templl = fromdd.split('-')
try:
    fromDate = datetime.datetime(int(templl[0]), int(templl[1]), int(templl[2]))
except (IndexError, ValueError):
    # Exit with a readable message instead of a traceback on a bad date.
    raise SystemExit('Invalid date "' + fromdd + '". Please use the format 2022-01-01.') from None

# processOneFile reads this module-level flag to decide whether replies a
# user wrote under his/her own primary post are discarded.
guanshui = False
temp = input('Discard those 跟帖 that a user made after his/her own post? (1=yes, 0=no, default=0)\n' +
             'Sometimes a user only post 跟帖 after his/her own 主帖. If yes, then such 跟帖 will be discarded. ')
try:
    guanshui = int(temp) > 0
except ValueError:
    # Empty or non-numeric answer: keep the default (do not discard).
    guanshui = False
print('guanshui='+str(guanshui))

users = dict()
REQUEST_TIMEOUT = 30   # seconds; without it a stalled connection hangs forever
MAX_EMPTY_PAGES = 3    # consecutive pages without any counted post before giving up
empty_pages = 0
i = 1
goOn = True
while goOn:
    url = 'https://bbs.wenxuecity.com/' + subid + '/?page=' + str(i)
    i = i + 1
    f = requests.get(url, timeout=REQUEST_TIMEOUT)
    counted_before = sum(sum(counts) for counts in users.values())
    goOn = processOneFile(users, f, fromDate)
    counted_after = sum(sum(counts) for counts in users.values())
    if goOn and counted_after == counted_before:
        # Nothing was counted and no older post was seen: the page holds no
        # posts (e.g. past the last page). Without this guard the loop would
        # request pages forever when the cut-off date predates the forum.
        empty_pages = empty_pages + 1
        if empty_pages >= MAX_EMPTY_PAGES:
            print("No more posts found. Stopping the search.")
            goOn = False
    else:
        empty_pages = 0

print("\n---------------\n")
# 'with' guarantees the file is closed even if a write fails.
with open('sxzj-out.csv', 'w', encoding='utf-8') as html2:
    for u, L in users.items():
        row = u + ',' + str(L[0]) + ',' + str(L[1])
        print(row)
        html2.write(row + '\n')
print("\n")
print("\n")
print("Please check the file sxzj-out.csv. The result is in it! Thanks for using this program. ---- 虎哥 / Nearby / 鄰兄 / 近兄")