語雀文章監測腳本設計與實現—yuqueGetter
0x01 功能介紹
語雀是一個很方便的知識庫整理工具,每天都會有很多小伙伴在語雀上更新自己的文章
Tide安全團隊Wiki知識庫:
https://www.yuque.com/tidesec
Tide安全團隊的小伙伴每天都會把自己的學習過程及學習成果更新到語雀中,為了更方便讀者每天閱讀小伙伴的文章,lmn在此設計了一個小程序可以每天獲取一次當天在某個語雀知識庫中的更新文章
總功能有兩個

功能實現
首先第一個功能:實現每天往相應的郵箱發送一次此知識庫所有更新的文章

第二個功能:用戶可以選擇某年某月的文章獲取更多想要的內容

0x02 具體實現代碼
引入頭文件 & 獲取當前時間,方便下面做判斷
# -*- coding = utf-8 -*-# @Time : 2022/2/28 4:12 下午# @Author : lmn# @File : yuque.py# @Software : PyCharm
import jsonfrom urllib import request, parseimport reimport scheduleimport timefrom email.mime.multipart import MIMEMultipartimport urllib.request
import sslssl._create_default_https_context = ssl._create_unverified_context
# Current local date as 'YYYY-MM-DD'. Kept on its own line: when the
# assignment shares a line with the comment it is swallowed by the '#'
# and ``Time`` is never defined.
Time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
模擬頭部獲取返回信息,user-Agent 與 Cookie 可通過抓包或直接網頁獲取

# Fetch the content of one web page.
def askUrl(url):
    """Return the UTF-8 decoded body of *url*, or "" when the request fails.

    The request is sent with a browser-like ``User-Agent`` and a ``Cookie``
    header; both "XXXX" placeholders must be replaced with real values.

    Any error raised while fetching or decoding is printed and swallowed
    (best-effort), so callers must be prepared to receive an empty string.
    """
    # Simulated browser headers.
    head = {
        "User-Agent": "XXXX",
        "Cookie": "XXXX"
    }
    req = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read()/decode()
        # raises, so the underlying connection is not leaked.
        with urllib.request.urlopen(req) as response:
            html = response.read().decode("utf-8")
    except Exception as e:  # deliberate best-effort: report and return ""
        print(e)
    return html
判斷首頁是否更新過,如果不從首頁判斷,只能通過對每個分支的語雀WIKI庫進行判斷,可能造成大量的不必要的運行,此段目的是為了提高判斷效率,減少運行時間
部分注釋用于測試,可自行測試

判斷只判斷每個知識庫的第一條,若有今天的文章則判定這個知識庫今天更新過
雖然首頁顯示“最近更新的知識庫”,但只能顯示五條,為了提高精準度,這里不通過“最近更新的知識庫”直接判斷
# Determine which WIKIs (knowledge bases) were updated today.
def judgeUpdate(url):
    """Return the knowledge bases whose newest summary entry is dated today.

    Only the first ``summary`` entry of each book is inspected; a book counts
    as updated when the first 10 characters of that entry's ``updated_at``
    equal the module-level ``Time`` string.

    Args:
        url: Homepage API address whose JSON response lists the books.

    Returns:
        ``[names, urls]`` - two parallel lists holding the name and the
        address of every updated knowledge base. Both lists are empty when
        the page could not be fetched.
    """
    updateWikiName = []  # names of the updated WIKIs
    updateWikiUrl = []   # addresses of the updated WIKIs

    html = askUrl(url)
    if not html:
        # askUrl() returns "" on failure; json.loads("") would raise and
        # take the whole scheduler loop down with it.
        return [updateWikiName, updateWikiUrl]

    data = json.loads(html)
    for x in data['data'][1]['placements'][0]['blocks'][0]['data'][0]['books']:
        summary = x.get('summary')
        if not summary:
            # A book without any summary entry cannot have been updated today.
            continue
        if summary[0]['updated_at'][0:10] == Time:
            updateWikiName.append(x['name'])
            updateWikiUrl.append('https://www.yuque.com/tidesec/' + x['slug'])

    return [updateWikiName, updateWikiUrl]  # [[names], [addresses]]
在我們已經知道哪些知識庫更新過后,通過如下函數可以判斷某知識庫中哪些文章更新過

返回相關參數用于輸出與發送郵件
# Write the collected articles to a file in a fixed format and mail the file.
def uploadArticleALL(ArticleALL):
    """Format every article record, save the report and send it as attachment.

    Args:
        ArticleALL: list of records
            ``[index, title, url, wiki_name, date, time, author]``.
            ``date``/``time`` are assumed to be 'YYYY-MM-DD' / 'HH:MM:SS'
            (they are produced by slicing ``content_updated_at``). Both are
            shifted by +8 hours and written back into the record in place.

    Side effects:
        Prints each formatted record, overwrites ``ArticleOneMonth.txt`` and
        calls ``sendEmail("NULL", "file")``.
    """
    from datetime import datetime, timedelta

    content = []
    for y in ArticleALL:
        # Real date arithmetic: adding 8 to the hour digits alone produced
        # hours of 24-31 and dropped the leading zero of small hours.
        shifted = datetime.strptime(
            y[4] + " " + y[5][0:8], "%Y-%m-%d %H:%M:%S"
        ) + timedelta(hours=8)
        y[4] = shifted.strftime("%Y-%m-%d")
        y[5] = shifted.strftime("%H:%M:%S")

        newUser = checkUser(y, y[4], y[5])
        if newUser == 1:
            # checkUser() returns the int 1 when it has no other name to
            # offer; concatenating it to a str would raise TypeError.
            newUser = y[6]

        # NOTE(review): the fields are joined without any separator, exactly
        # as in the original; a line break was presumably intended - confirm.
        content2 = "".join([
            "WIKI編號:", str(y[0]),
            "文章題目:", y[1],
            "文章地址:", y[2],
            "所屬WIKI:", y[3],
            "更新日期:", y[4],
            "更新時間:", y[5],
            "創建者:", y[6],
            "更新者:", newUser,
        ])
        print(content2)
        content.append(content2)

    # sendEmail() labels the attachment as UTF-8, so write it as UTF-8; the
    # context manager guarantees the file is flushed and closed before mailing.
    with open("ArticleOneMonth.txt", "w+", encoding="utf-8") as document:
        document.write("".join(content))
    sendEmail("NULL", "file")
到這里不算完,測試時發現一個巨型坑,這里雖然通過目錄文章給出的信息獲取到作者參數,但若文章被第二作者更新,這里并不顯示
也就是最后更新的作者現在無名無姓沒有人知道(除非點進去在最下面才能看到)

這里通過設計一個checkUser函數判斷兩者最后更新的時間,通過判斷年月日小時分鐘秒來精確判斷誰是最后一個更新的
但是這里又不能說最后更新的是寫文章的人,所以我們顯示更新和創建兩個人的名字

def checkUser(updateArticleALL, articleDate, articleTime):
    """Decide which name should be reported as the article's last updater.

    The article page is downloaded and the JSON embedded in its
    ``decodeURIComponent("...")`` call is decoded.

    Args:
        updateArticleALL: one article record; element 2 is the article URL.
        articleDate: update date of the article, 'YYYY-MM-DD'.
        articleTime: update time of the article, 'HH:MM:SS' (an unpadded
            hour such as '9:05:00' is tolerated).

    Returns:
        The int ``1`` when the page carries no embedded data, the data has no
        ``"book"`` key, or the article timestamp is later than the creator's
        ``updated_at``; otherwise the creator's name. Callers treat ``1`` as
        "fall back to the author already stored in the record".
    """
    html = askUrl(updateArticleALL[2])
    tags = re.findall(r"decodeURIComponent\(\"(.+?)\"\)", html)
    if not tags:
        # Download failed or page layout changed: nothing to compare against.
        return 1

    data = json.loads(urllib.parse.unquote(tags[0]))
    if "book" not in data:
        return 1

    creator = data['doc']['joinToken']['creator']
    stamp = creator['updated_at']

    hour, minute, second = (int(p) for p in re.findall(r"\d+", articleTime)[:3])
    # NOTE(review): +8 mirrors the original time-zone shortcut. The callers
    # in this file already shift the hour before calling - confirm that the
    # second shift is really wanted.
    article = (
        int(articleDate[0:4]), int(articleDate[5:7]), int(articleDate[8:10]),
        hour + 8, minute, second,
    )
    last = (
        int(stamp[0:4]), int(stamp[5:7]), int(stamp[8:10]),
        int(stamp[11:13]), int(stamp[14:16]), int(stamp[17:19]),
    )

    # Tuples compare field by field, most significant first. The previous
    # elif chain tested every field on its own, so e.g. a later month in an
    # earlier year was wrongly treated as "newer".
    if article > last:
        return 1
    return creator['name']
(這里應該設置時區,但為方便起見直接+8計算)
下面這段代碼用于調整一個最好的格式用于發送郵件
def uploadArticleALL(ArticleALL):
    """Format every article record, save the report and send it as attachment.

    Args:
        ArticleALL: list of records
            ``[index, title, url, wiki_name, date, time, author]``.
            ``date``/``time`` are assumed to be 'YYYY-MM-DD' / 'HH:MM:SS'
            (they are produced by slicing ``content_updated_at``). Both are
            shifted by +8 hours and written back into the record in place.

    Side effects:
        Prints each formatted record, overwrites ``ArticleOneMonth.txt`` and
        calls ``sendEmail("NULL", "file")``.
    """
    from datetime import datetime, timedelta

    content = []
    for y in ArticleALL:
        # Real date arithmetic: adding 8 to the hour digits alone produced
        # hours of 24-31 and dropped the leading zero of small hours.
        shifted = datetime.strptime(
            y[4] + " " + y[5][0:8], "%Y-%m-%d %H:%M:%S"
        ) + timedelta(hours=8)
        y[4] = shifted.strftime("%Y-%m-%d")
        y[5] = shifted.strftime("%H:%M:%S")

        newUser = checkUser(y, y[4], y[5])
        if newUser == 1:
            # 1 means "no separate updater found": show the author instead.
            newUser = y[6]

        # NOTE(review): the fields are joined without any separator, exactly
        # as in the original; a line break was presumably intended - confirm.
        content2 = "".join([
            "WIKI編號:", str(y[0]),
            "文章題目:", y[1],
            "文章地址:", y[2],
            "所屬WIKI:", y[3],
            "更新日期:", y[4],
            "更新時間:", y[5],
            "創建者:", y[6],
            "更新者:", newUser,
        ])
        print(content2)
        content.append(content2)

    # sendEmail() labels the attachment as UTF-8, so write it as UTF-8; the
    # context manager guarantees the file is flushed and closed before mailing.
    with open("ArticleOneMonth.txt", "w+", encoding="utf-8") as document:
        document.write("".join(content))
    sendEmail("NULL", "file")
下面這段代碼用于第一個功能:獲取當天更新過的知識庫目錄中的文章相關參數
# Fetch the home page of each updated WIKI and return the articles updated today.
def getWiki(updateWiki):
    """Collect the articles updated today from every given knowledge base.

    Args:
        updateWiki: ``[names, urls]`` - two parallel lists, the shape
            ``judgeUpdate`` returns.

    Returns:
        A list of records
        ``[index, title, url, wiki_name, content_updated_at, author]`` where
        ``index`` is a running 1-based counter over all articles found.
        An article is included when the first 10 characters of its
        ``content_updated_at`` equal the module-level ``Time`` string.
    """
    # Matches the URL-encoded JSON embedded in the page.
    pattern = re.compile(r"decodeURIComponent\(\"(.+?)\"\)")
    updateArticleALL = []
    count = 0
    # Pair every URL with its own name. The name used to be looked up with
    # the running article counter, which picked the wrong knowledge base (or
    # raised IndexError) as soon as one wiki had more than one article.
    for wikiName, wikiUrl in zip(updateWiki[0], updateWiki[1]):
        tags = pattern.findall(askUrl(wikiUrl))
        if not tags:
            # Download failed or page layout changed: skip this wiki.
            continue
        data = json.loads(urllib.parse.unquote(tags[0]))
        for y in data['book']['docs']:
            if y['content_updated_at'][0:10] == Time:
                count += 1
                updateArticleALL.append([
                    count,
                    y['title'],
                    wikiUrl + '/' + y['slug'],
                    wikiName,
                    y['content_updated_at'],
                    y['user']['name'],
                ])
    return updateArticleALL
這個函數用于第二個功能獲取某個知識庫中某個Wiki的相關參數
# Collect every article of the given WIKIs for one year and month.
def getAllWiki(Wiki, Year, Month):
    """Return all articles whose ``content_updated_at`` falls in Year/Month.

    Args:
        Wiki: ``[names, urls]`` - two parallel lists of knowledge bases.
        Year: four-digit year, e.g. ``"2022"``.
        Month: month number with or without leading zero, e.g. ``"2"``,
            ``"02"`` or ``"11"``.

    Returns:
        A list of records
        ``[index, title, url, wiki_name, date, time, author]`` where ``date``
        is ``content_updated_at[0:10]``, ``time`` is
        ``content_updated_at[11:19]`` and ``index`` is a running 1-based
        counter over all matching articles.
    """
    pattern = re.compile(r"decodeURIComponent\(\"(.+?)\"\)")
    wantedYear = str(Year).strip()
    # Compare the full two-digit month. The single character at [6:7] could
    # never equal "10"/"11"/"12" and made "1" match both 01 and 11.
    wantedMonth = str(Month).strip().zfill(2)

    ArticleALL = []
    count2 = 0
    for wikiName, wikiUrl in zip(Wiki[0], Wiki[1]):
        tags = pattern.findall(askUrl(wikiUrl))
        if not tags:
            # Download failed or page layout changed: skip this wiki.
            continue
        data = json.loads(urllib.parse.unquote(tags[0]))
        for y in data['book']['docs']:
            stamp = y['content_updated_at']
            if stamp[0:4] != wantedYear or stamp[5:7] != wantedMonth:
                continue
            count2 += 1
            ArticleALL.append([
                count2,
                y['title'],
                wikiUrl + '/' + y['slug'],
                wikiName,
                stamp[0:10],
                stamp[11:19],
                y['user']['name'],
            ])
    return ArticleALL
下面就是發送郵件的代碼!
這里通過163郵箱
首先設置一下

新增授權密碼

之后通過函數實現,因為兩個功能都通過這個函數進行發送
但由于第二個功能輸出過多,所以采用附件的形式,這里通過采用判斷實現具體功能
# Send an e-mail.
def sendEmail(content, type):
    """Send the report by mail, either as plain text or as a file attachment.

    Args:
        content: Mail body; only used when ``type`` is ``"text"``.
        type: ``"text"`` sends ``content`` as a plain-text mail titled
            ``Time + 'WIKI文章'``; ``"file"`` sends ``ArticleOneMonth.txt``
            from the current directory as an attachment.

    Raises:
        ValueError: if ``type`` is neither ``"text"`` nor ``"file"``.

    SMTP errors are printed, not raised.
    """
    import smtplib
    from email.mime.text import MIMEText

    # Third-party SMTP service - replace the placeholders.
    mail_host = "XXXX"  # SMTP server
    mail_user = "XXXX"  # user name
    mail_pass = "XXXX"  # authorization password

    sender = 'XXXX'  # sender address

    receivers = ['XXXX']   # recipient 1
    receivers2 = ['XXXX']  # recipient 2
    receiver = receivers + receivers2

    if type == "text":
        message = MIMEText(content, 'plain', 'utf-8')
        message['From'] = "{}".format(sender)
        message['To'] = ",".join(receivers)
        message['Subject'] = Time + 'WIKI文章'
    elif type == "file":
        # Multipart message: a short text body plus the attachment.
        message = MIMEMultipart()
        message['From'] = "{}".format(sender)
        message['To'] = ",".join(receivers)
        message['Subject'] = "一個月文章"

        message.attach(MIMEText('一個月的文章請查收', 'plain', 'utf-8'))

        # Read the attachment through a context manager so the file handle
        # is closed again.
        with open('ArticleOneMonth.txt', 'rb') as attachment:
            payload = attachment.read()
        att1 = MIMEText(payload, 'base64', 'utf-8')
        att1["Content-Type"] = 'application/octet-stream'
        # The filename given here is the name shown in the received mail.
        att1["Content-Disposition"] = 'attachment; filename="ArticleOneMonth.txt"'
        message.attach(att1)
    else:
        # Previously this fell through to an UnboundLocalError on `message`.
        raise ValueError("type must be 'text' or 'file', got {!r}".format(type))

    try:
        # The context manager sends QUIT and closes the socket on exit.
        with smtplib.SMTP_SSL(mail_host, 465) as smtpObj:
            smtpObj.login(mail_user, mail_pass)
            smtpObj.sendmail(sender, receiver, message.as_string())
            print("mail has been send successfully.")
    except smtplib.SMTPException as e:
        print(e)
這里又有一個坑,這里我在設置顯示格式時設置了center,但這里需要html的居中
踩坑代碼如下
# Pitfall version: columns are padded with str.center(). For every cell the
# number of CJK characters it contains is subtracted from the width of 50.
content1 = "".join((
    "文章題目".center(50 - 4, ' '),
    "文章地址".center(50 - 4, ' '),
    "作者名".center(50 - 3, ' '),
))
content = [content1]
c = ''
b = ''
for y in updateArticleALL:
    content2 = [
        i.center(50 - len(re.findall('([\u4e00-\u9fa5])', i)), ' ')
        for i in y
    ]
    content.append(c.join(content2))
print(b.join(content))
實現每天都發送的函數如下:
def everyDay():
    """Collect the articles updated today and mail them as plain text.

    Refreshes the module-level ``Time`` string, asks ``judgeUpdate`` which
    knowledge bases changed, gathers their articles with ``getWiki`` and sends
    one mail through ``sendEmail(..., "text")``. The mail is sent even when no
    article was found.

    Each record's timestamp (element 4) is shifted by +8 hours in place.
    """
    from datetime import datetime, timedelta

    global Time
    Time = time.strftime('%Y-%m-%d', time.localtime(time.time()))

    # Homepage API used to find out which knowledge bases were updated.
    mainUrl = "https://www.yuque.com/api/groups/1542173/homepage?include_data=true"
    updateWiki = judgeUpdate(mainUrl)
    updateArticleALL = getWiki(updateWiki)

    content = []
    for y in updateArticleALL:
        # Record layout: [index, title, url, wiki_name, timestamp, author]
        stamp = y[4]
        # Real date arithmetic: adding 8 to the hour digits alone produced
        # hours of 24-31 and, for small hours, a one-digit hour that shifted
        # every later slice by one character.
        shifted = datetime.strptime(
            stamp[0:10] + " " + stamp[11:19], "%Y-%m-%d %H:%M:%S"
        ) + timedelta(hours=8)
        y[4] = (
            shifted.strftime("%Y-%m-%d")
            + stamp[10:11]  # keep the original date/time separator
            + shifted.strftime("%H:%M:%S")
        )

        newUser = checkUser(y, y[4][0:10], y[4][11:19])
        if newUser == 1:
            # 1 means "no separate updater found": show the author instead.
            newUser = y[5]

        # NOTE(review): the fields are joined without any separator, exactly
        # as in the original; a line break was presumably intended - confirm.
        content.append("".join([
            "WIKI編號:", str(y[0]),
            "文章題目:", y[1],
            "文章地址:", y[2],
            "所屬WIKI:", y[3],
            "更新日期:", y[4][0:10],
            "更新時間:", y[4][11:19],
            "創建者:", y[5],
            "更新者:", newUser,
        ]))

    sendEmail("".join(content), "text")
實現發送某個月的總文章如下
def oneMonth(Year, Month):
    """Mail every article of the given year and month.

    Reads the list of knowledge bases from the homepage API, collects their
    articles with ``getAllWiki`` and passes the result to
    ``uploadArticleALL``.

    Args:
        Year: year to select, passed through to ``getAllWiki``.
        Month: month to select, passed through to ``getAllWiki``.
    """
    mainUrl = "https://www.yuque.com/api/groups/1542173/homepage?include_data=true"
    data = json.loads(askUrl(mainUrl))
    books = data['data'][1]['placements'][0]['blocks'][0]['data'][0]['books']

    # Two parallel lists: names and addresses of all knowledge bases.
    WikiName = []
    WikiUrl = []
    for book in books:
        WikiName.append(book['name'])
        WikiUrl.append('https://www.yuque.com/tidesec/' + book['slug'])
    Wiki = [WikiName, WikiUrl]

    uploadArticleALL(getAllWiki(Wiki, Year, Month))
主函數
def main():
    """Interactive entry point.

    Menu:
        1 - schedule ``everyDay`` daily at 23:55 and run the scheduler loop
            forever (this branch never returns).
        2 - ask for a year and a month and call ``oneMonth``.
        0 - quit.
    Any other input prints a retry message and shows the menu again.
    """
    print("Welcome To Use yuqueGetter")
    choose = 1
    while choose:
        print("***** 1 每天發一次當天更新文章 *****")
        print("***** 2 選擇接收某個月的文章 *****")
        print("***** 0 退出 *****")
        try:
            choose = int(input("please choose the function:>"))
        except ValueError:
            # Non-numeric input used to crash the program with a traceback.
            print("請重新輸入")
            continue

        if choose == 1:
            # schedule.every(1).minutes.do(everyDay)  # every minute, for testing
            schedule.every().day.at("23:55").do(everyDay)
            while True:
                schedule.run_pending()
                time.sleep(1)
        elif choose == 2:
            Year = input("選擇年份(2020、2021、2022):>")
            Month = input("選擇月份(1、2 ... 11、12):>")
            oneMonth(Year, Month)
        elif (choose < 0) or (choose > 2):
            print("請重新輸入")


if __name__ == '__main__':
    main()
其中每天都定時發送的語句(已包含在代碼中)
# Register everyDay to run once a day at 23:55; the same statement appears
# inside main().
schedule.every().day.at("23:55").do(everyDay)
最后,也可自行添加功能,例如爬取全部文章
def ALLWIKI(Year, Month):
    """Collect the articles of every knowledge base for one year and month.

    Args:
        Year: year to select, passed through to ``getAllWiki``.
        Month: month to select, passed through to ``getAllWiki``.

    Returns:
        The list of article records produced by ``getAllWiki``.
    """
    mainUrl = "https://www.yuque.com/api/groups/1542173/homepage?include_data=true"
    html = askUrl(mainUrl)
    data = json.loads(html)

    WikiName = []
    WikiUrl = []
    for x in data['data'][1]['placements'][0]['blocks'][0]['data'][0]['books']:
        WikiName.append(x['name'])
        WikiUrl.append('https://www.yuque.com/tidesec/' + x['slug'])
    Wiki = [WikiName, WikiUrl]

    # getAllWiki() requires Year and Month; calling it with Wiki alone raised
    # TypeError, and the collected articles were thrown away.
    return getAllWiki(Wiki, Year, Month)
0x03 實現效果
運行效果

保存文件中

發送方

第二個功能接收方效果


第一個功能接收方效果
