爬取xupt课程表

查课表这件事，以前用的是超级课程表，但嫌它广告太多，加上手机内存不够，就卸载了，于是自己写了一个课表程序。

由于学校教务系统登录时需要提交的请求参数太多，就直接用 selenium 来模拟登录，之后再用 BeautifulSoup 解析页面、提取课程信息。

流程：查看本地json格式的课表文件，若没有则爬并存储，若有则直接读取并输出。

做的时候发现单双周挺难处理的，最后用了空间换时间的方法：把单周和双周各存成一份完整的课表，放在同一个课表结构里，查询时按周次的奇偶直接取对应的那一份。

最后用Power Automate 简单做了一个桌面流：

结果：

附源代码：

from selenium import webdriver  # 启动浏览器需要用到
from selenium.webdriver.common.keys import Keys  # 提供键盘按键支持（最后一个K要大写）
from bs4 import BeautifulSoup
import time
import re
import json

# Account credentials for the school's educational administration system
student_id=""
studend_password=""
# Path to the browser driver, e.g. F:\\chromedriver_win32\\chromedriver.exe
place="F:\\chromedriver_win32\\chromedriver.exe" 
# First day of term, format: 30 Aug 2021
time_to_strat = "30 Aug 2021"
# Directory where the scraped timetable is stored
stored_place = "F:\\ProbeN1\\timetable\\"


def week_of_term(start, day):
    """Return the 1-based term week that the date *day* falls in.

    Both arguments are ``time.struct_time`` values; only their calendar
    date and (for *start*) ``tm_wday`` are used.  Weeks run Monday to
    Sunday and week 1 is the calendar week containing *start*.  Dates
    before that week yield 0 or a negative number.

    The count is based on elapsed calendar days rather than on a
    difference of week-of-year numbers, because week-of-year restarts
    every January and would give a wrong week (and possibly the wrong
    odd/even parity) for a term that runs past New Year.
    """
    def local_noon(moment):
        # Noon keeps the day difference exact even if a DST shift makes a
        # local day 23 or 25 hours long.
        return time.mktime(
            (moment.tm_year, moment.tm_mon, moment.tm_mday, 12, 0, 0, 0, 0, -1)
        )

    elapsed_days = round((local_noon(day) - local_noon(start)) / 86400)
    # Shift so that day 0 is the Monday of the week containing *start*.
    return (elapsed_days + start.tm_wday) // 7 + 1


# Work out which weekday tomorrow is and which term week it belongs to
start_time = time.strptime(time_to_strat, "%d %b %Y")
tomorrow_time_tamp = time.time()+86400
tomorrow_time = time.localtime(tomorrow_time_tamp)
weekin = week_of_term(start_time, tomorrow_time)  # odd/even week is weekin % 2
# tm_wday is 0 (Monday) .. 6 (Sunday); the timetable uses "1".."5" for
# Monday..Friday, "6" for Saturday and "0" for Sunday.  Using tm_wday
# avoids depending on the locale-specific day name.
tomorrow_weekday = str((tomorrow_time.tm_wday + 1) % 7)
time_judge=[weekin,tomorrow_weekday]

def print_class(timetable,time_judge):
    """Print tomorrow's classes from *timetable*.

    *time_judge* is ``[week_number, weekday]``.  The weekday is compared
    as an integer: 6 and 0 print the "no class" message.  Otherwise an
    even week number selects ``timetable["双"]`` and an odd one
    ``timetable["单"]``; for every class stored under that weekday the
    first four fields are printed, followed by a divider line.  The
    value of ``timetable["flag"]["1"]`` is always printed first.
    """
    divider = "---------------------------------------------------"

    print(timetable["flag"]["1"])

    weekday = time_judge[1]
    if int(weekday) in (6, 0):
        print("明天没课!")
        return

    print(divider)

    parity = time_judge[0] % 2
    if parity == 0:
        week_kind = "双"
    elif parity == 1:
        week_kind = "单"
    else:
        # A non-integer week number matches neither branch: print nothing more.
        return

    for lesson in timetable[week_kind][str(weekday)].values():
        for field_index in (0, 1, 2, 3):
            print(lesson[field_index])
        print(divider)

# Empty timetable skeleton: one dict per weekday ("1".."5", Monday to
# Friday) for odd weeks ("单") and for even weeks ("双"), plus a "flag"
# section whose entries "1" and "2" start out as empty strings.
timetable = {
    week_kind: {str(weekday): {} for weekday in range(1, 6)}
    for week_kind in ("单", "双")
}
timetable["flag"] = {"1": "", "2": ""}

# Read the cached timetable first; scrape the website only when no usable
# cache exists.
def _load_cached_timetable(cache_path):
    """Return the timetable cached at *cache_path*, or None if unusable.

    The cache is usable only when the file can be read, parses as JSON and
    its ``["flag"]["2"]`` entry equals "stored".  Only those specific
    failures are treated as "no cache"; any other error propagates instead
    of silently triggering a new scrape.
    """
    try:
        with open(cache_path, 'r') as cache_file:
            cached = json.load(cache_file)
        if cached["flag"]["2"] == "stored":
            return cached
    except (OSError, ValueError, KeyError, TypeError):
        # Missing/unreadable file, invalid JSON or unexpected structure:
        # fall through and report that there is no usable cache.
        return None
    return None


def _scrape_timetable():
    """Log in with selenium, parse the timetable page and cache the result.

    Fills the module-level ``timetable`` dict, writes it as JSON to
    ``stored_place + 'file.txt'`` and returns it.
    """
    chrome_options = webdriver.ChromeOptions()
    # The "detach" option of the earlier version is dropped: the browser is
    # closed explicitly below, so keeping it alive would only leak a
    # headless Chrome process on every scrape.
    chrome_options.add_argument('--headless')  # run in the background
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])  # less console noise
    # NOTE(review): positional driver path, `chrome_options=` and
    # `find_elements_by_name` are the older selenium API - confirm the
    # installed selenium version still accepts them.
    driver = webdriver.Chrome(place, chrome_options=chrome_options)
    try:
        driver.get("http://www.zfjw.xupt.edu.cn/jwglxt/xtgl/login_slogin.html")

        # find_elements_by_name returns a list; the first "yhm" element is
        # used for the user name.
        elem_yhm = driver.find_elements_by_name("yhm")[0]
        elem_yhm.clear()
        elem_yhm.send_keys(student_id)

        # The second "mm" element is used for the password.
        elem_mm = driver.find_elements_by_name("mm")[1]
        elem_mm.clear()
        elem_mm.send_keys(studend_password)
        elem_mm.send_keys(Keys.RETURN)  # submit by pressing Enter

        # Go straight to the timetable page.
        driver.get("http://www.zfjw.xupt.edu.cn/jwglxt/kbcx/xskbcx_cxXskbcxIndex.html?gnmkdm=N2151&layout=default&su=" + student_id)
        time.sleep(2)   # give the page's asynchronous requests time to finish before reading the source
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always close the browser, even when login or page loading fails.
        driver.quit()

    # Find the student's name in the "<name>的课表" title.
    find_student_name = re.compile(r"[\u4e00-\u9fa5]+的课表")
    student_name_soup = soup.find("div",{"class":"timetable_title"})
    name_match = find_student_name.search(str(student_name_soup))
    if name_match:
        # Strip the trailing "的课表" from the matched text.  The earlier
        # version sliced the list returned by findall() instead of the
        # string, so the name was never found.
        student_name = name_match.group(0)[:-3]
        print(student_name+"的课表")
    else:
        student_name = ""
        print("未找到名称")

    find_inner_html = re.compile(r"\'[^\']*\'")
    for week in range(1,6):
        for classtime in range(1,12):
            # Timetable cells have ids of the form "<weekday>-<period>".
            each_class = soup.find("td",{"id":str(week)+"-"+str(classtime)})
            if each_class is None:  # no cell with this id
                continue

            content = []
            for each in each_class.find_all("font"):
                for child in each.children:
                    # Takes the first single-quoted substring of the string
                    # form of the bound `unwrap` method.
                    # NOTE(review): this presumably relies on text nodes
                    # being shown in quotes there - verify before changing.
                    name = find_inner_html.findall(str(child.unwrap))
                    if name:
                        content.append(name[0][1:-1])

            # 11 fields: stored for both odd and even weeks.
            # 22 fields: first half for odd weeks, second half for even weeks.
            # Any other count is ignored.
            if len(content) == 11:
                timetable["单"][str(week)].update({str(classtime):content})
                timetable["双"][str(week)].update({str(classtime):content})
            elif len(content) == 22:
                timetable["单"][str(week)].update({str(classtime):content[:11]})
                timetable["双"][str(week)].update({str(classtime):content[11:]})

    # Write the cache file.
    timetable["flag"]["1"] = student_name
    timetable["flag"]["2"] = "stored"
    with open(stored_place+'file.txt', 'w') as file:
        file.write(json.dumps(timetable))

    return timetable


cached_timetable = _load_cached_timetable(stored_place+'file.txt')
if cached_timetable is not None:
    print_class(cached_timetable,time_judge)
else:
    print_class(_scrape_timetable(),time_judge)

当然，之后也新建了一个 bat 文件来帮助自动执行：

:: Launcher for the timetable script.
:: Switch to drive F:, enter the folder used in the cd line below,
:: run the script, then keep the console window open so the output can be read.
f:
cd F:\ProbeN1\timetable
python3 timetable.py
pause

然后再在windows里设置