IT博客汇
  • 首页
  • 精华
  • 技术
  • 设计
  • 资讯
  • 扯淡
  • 权利声明
  • 登录 注册

    SCU教务处爬虫记录

    贺翔/CarOL发表于 2023-10-27 10:16:57
    love 0

    概述

    记录爬取scu教务处的事项。

    模拟登录

    使用requests.session()保存爬虫状态

    首先需要进行登录:

    前往登录界面,F12打开network,输入错误的登录信息可以查看网络请求记录。

    真正有用的登录请求信息已经标出。

    只需要对照着General中的Request Method(POST)、Request Headers和Form Data

    使用requests.session进行请求即可。

    注意

    1. Form Data中的_spring_security_remember_me参数是装了基兄的scu插件才能看到,加上这个参数可以增加session的保存时间,不需要每次爬取信息都要登录一遍。
    2. Form Data中的j_password不是明文密码,而是对密码做MD5哈希后得到的十六进制摘要。
    3. Request Headers一定要打对,Cookie参数可以忽略。
    4. 验证码可以先用GET方法获取,再POST登录。

    爬取姓名&头像

    爬取课表

    参考代码

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    import requests
    import re
    import hashlib
    from lxml import etree
    from PIL import Image
    from io import BytesIO

    class JWCSpider():
        """Small scraper for the SCU academic affairs site (zhjw.scu.edu.cn).

        A single ``requests`` session is kept for the lifetime of the object so
        that the cookies obtained at login are reused by every later request.

        Attributes:
            username: Account name sent as ``j_username``.
            password: MD5 hex digest of the plain-text password, sent as
                ``j_password``.
            session: The shared ``requests`` session.
            headers: Base request headers. They are never mutated by the
                ``get_*`` methods; per-request variations are applied to a copy.
            timeout: Seconds to wait for each HTTP request before giving up.
        """

        # Per-request timeout in seconds. Without one, requests waits forever
        # and the timeout branch in login_in() could never be reached.
        timeout = 10

        def __init__(self, username, password):
            """Store the credentials and prepare the session and base headers.

            Args:
                username: Account name used to log in.
                password: Plain-text password; only its MD5 hex digest is kept.
            """
            self.username = username
            self.password = self.__md5preFix(password)
            self.session = requests.session()
            self.headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Connection': 'keep-alive',
                'Host': 'zhjw.scu.edu.cn',
                'Referer': 'http://zhjw.scu.edu.cn/login',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36',
                'Origin': 'http://zhjw.scu.edu.cn',
            }

        def __md5preFix(self, password: str) -> str:
            """Return the MD5 hex digest of the UTF-8 encoded *password*."""
            return hashlib.md5(password.encode('utf-8')).hexdigest()

        def _headers_with(self, overrides: dict) -> dict:
            """Return a copy of the base headers with *overrides* applied.

            Working on a copy keeps request-specific headers (for example
            ``X-Requested-With``) from leaking into unrelated later requests.
            """
            merged = dict(self.headers)
            merged.update(overrides)
            return merged

        @staticmethod
        def _parse_name(html: str):
            """Extract the student name from the roll-info page markup.

            The name is the text between ``title="`` and the first following
            ``的照片`` inside the same attribute value.

            Returns:
                The name as a string, or ``None`` when the marker is absent.
            """
            found = re.search(r'title="([^"]*?)的照片', html)
            return found.group(1) if found else None

        def __get_captcha(self):
            """Download the captcha, save it as ``captcha.jpg`` and ask the user.

            The image is converted to greyscale before saving. The text typed
            by the user at the prompt is returned.
            """
            url = 'http://zhjw.scu.edu.cn/img/captcha.jpg'
            captcha_img = self.session.get(url, timeout=self.timeout).content
            img = Image.open(BytesIO(captcha_img)).convert('L')
            img.save('captcha.jpg')
            return input('验证码:')

        def __tofile(self, content: str, filename: str = 'format.html'):
            """Write *content* to *filename* as UTF-8 text, replacing the file."""
            with open(filename, 'w', encoding='utf-8') as ofile:
                ofile.write(content)

        def login_in(self) -> tuple:
            """Log in to the academic affairs site.

            Returns:
                A ``(status, message)`` tuple. ``status`` is ``'ok'`` on
                success and ``'error'`` on a network failure, a timeout, or
                when the response contains ``errorCode=``.
            """
            url = 'http://zhjw.scu.edu.cn/j_spring_security_check'
            try:
                data = {
                    'j_username': self.username,
                    'j_password': self.password,
                    'j_captcha': self.__get_captcha(),
                }
                response = self.session.post(
                    url, data=data, headers=self.headers, timeout=self.timeout)
            # requests raises its own exception types, which do not derive from
            # the builtins; the builtins are still listed for compatibility.
            # Timeout is checked first because ConnectTimeout is a subclass of
            # both requests exception classes.
            except (requests.exceptions.Timeout, TimeoutError):
                return ('error', '访问超时登录错误')
            except (requests.exceptions.ConnectionError, ConnectionError):
                return ('error', '网络连接登录错误')

            # The marker is pure ASCII, so search the raw bytes and avoid a
            # UnicodeDecodeError on unexpectedly encoded responses.
            if b'errorCode=' in response.content:
                return ('error', '账号或者密码错误')
            return ('ok', '登录成功')

        def get_planCompletion(self):
            """Fetch the plan-completion page and save it to ``format.html``."""
            url = 'http://zhjw.scu.edu.cn/student/integratedQuery/planCompletion/index'
            headers = self._headers_with({
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Referer': 'http://zhjw.scu.edu.cn/student/integratedQuery/course/courseSchdule/index',
            })
            response = self.session.get(url, headers=headers, timeout=self.timeout)
            self.__tofile(response.text)

        def get_courseTable(self):
            """Fetch this semester's timetable and save it to ``coursetable.json``."""
            url = 'http://zhjw.scu.edu.cn/student/courseSelect/thisSemesterCurriculum/ajaxStudentSchedule/curr/callback'
            headers = self._headers_with({
                'Accept': '*/*',
                'Referer': 'http://zhjw.scu.edu.cn/student/courseSelect/thisSemesterCurriculum/index',
                'X-Requested-With': 'XMLHttpRequest',
            })
            response = self.session.get(url, headers=headers, timeout=self.timeout)
            self.__tofile(response.text, "coursetable.json")

        def get_courseSection(self):
            """Fetch the section/time table and save it to ``format.json``."""
            url = "http://zhjw.scu.edu.cn/ajax/getSectionAndTime"
            headers = self._headers_with({
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "Referer": "http://zhjw.scu.edu.cn/student/courseSelect/thisSemesterCurriculum/index",
                'X-Requested-With': 'XMLHttpRequest',
            })
            response = self.session.post(
                url, headers=headers, data={"planNumber": "", "ff": "f"},
                timeout=self.timeout)
            self.__tofile(response.text, "format.json")

        def get_studentPic(self):
            """Download the student's photo and save it as ``student.jpg``."""
            url = 'http://zhjw.scu.edu.cn/main/queryStudent/img?715.0'
            headers = self._headers_with({
                'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
                'Referer': 'http://zhjw.scu.edu.cn/student/courseSelect/thisSemesterCurriculum/index',
            })
            stdPic = self.session.get(url, headers=headers, timeout=self.timeout).content
            Image.open(BytesIO(stdPic)).save('student.jpg')

        def get_name(self):
            """Print and return the student's name from the roll-info page.

            Returns:
                The name, or ``None`` (printing nothing) when the page does not
                contain the expected photo title, instead of raising
                ``IndexError``.
            """
            url = 'http://zhjw.scu.edu.cn/student/rollManagement/rollInfo/index'
            headers = self._headers_with({
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Referer': 'http://zhjw.scu.edu.cn/',
            })
            response = self.session.get(url, headers=headers, timeout=self.timeout)
            name = self._parse_name(response.content.decode('utf-8'))
            if name is not None:
                print(name)
            return name


    def main():
        """Demo run: log in, then download the photo and the section table.

        The credentials are placeholders and must be replaced with real ones.
        """
        spider = JWCSpider('studentid', 'password')
        login_result = spider.login_in()
        print(login_result)
        spider.get_studentPic()
        spider.get_courseSection()

    # Run the demo only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        main()



沪ICP备19023445号-2号
友情链接