# 爬虫

# requests

import requests
from lxml import etree

# Request headers applied to every request made through the demo session below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us",
    "Connection": "keep-alive",
    "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7"
}


if __name__ == '__main__':
    # Demo: fetch a page and extract the first word of its <title>.
    session = requests.session()
    # Merge into the session's header mapping instead of replacing it, so the
    # defaults that requests sets up (e.g. Accept-Encoding) are kept.
    session.headers.update(headers)
    url = ''  # placeholder: set this to the page to fetch
    response = session.get(url)
    # Hand lxml the raw bytes and let its HTML parser work out the encoding.
    # A bare bytes.decode() assumes UTF-8 and fails on GB2312 pages, which the
    # Accept-Charset header above explicitly asks for.
    html = etree.HTML(response.content)
    title_elements = html.xpath(r'//title') if html is not None else []
    title_text = title_elements[0].text if title_elements else None
    # First whitespace-separated token of the title; '' when the page has no
    # <title> element or the element is empty.
    title_words = title_text.split() if title_text else []
    title = title_words[0] if title_words else ''

一些工具函数:

from datetime import datetime
from os import mkdir
from os.path import exists

import requests
from lxml import etree

# Directory into which log_request_error() writes the bodies of failed responses.
log_directory = 'log'

def get_filename_by_url(response: requests.Response) -> str:
    """Return the last path segment of the response's URL.

    Only the path component of the URL is considered, so a query string or
    fragment (``.../report.pdf?token=abc#page=2``) does not end up in the
    returned name, and a URL without any path does not yield the host name.

    Args:
        response: Response whose ``url`` attribute is inspected.

    Returns:
        The text after the last ``/`` of the URL path. Empty string when the
        path is empty or ends with ``/``.
    """
    # Local import keeps this helper usable without touching the file's
    # import block.
    from urllib.parse import urlsplit

    return urlsplit(response.url).path.rsplit('/', 1)[-1]


def get_extension_by_headers(response: requests.Response) -> str:
    """Derive a file extension from the response's Content-Type header.

    The extension is the media subtype, i.e. ``html`` for a header value
    such as ``text/html;charset=UTF-8``.

    Args:
        response: Response whose ``headers`` mapping is inspected.

    Returns:
        The subtype part of the media type with surrounding whitespace
        removed, or an empty string when the header is absent or does not
        contain a ``type/subtype`` pair.
    """
    content_type = response.headers.get('content-type') or ''
    # Drop parameters such as ";charset=UTF-8" before looking at the type.
    media_type = content_type.split(';', 1)[0].strip()
    _, slash, subtype = media_type.partition('/')
    # No slash means the value is not a media type: report "unknown" as ''
    # so callers can apply their own fallback instead of catching errors.
    return subtype.strip() if slash else ''


def log_request_error(response: requests.Response):
    """Report a failed request on stdout and dump the response body to disk.

    Prints the request URL, the status code and both header sets, then writes
    the raw response body to ``<log_directory>/<YYYYmmdd_HHMMSS>.<extension>``.
    The extension comes from get_extension_by_headers() and falls back to
    ``log`` when none can be determined.

    Args:
        response: The response to report; its ``request`` attribute supplies
            the URL and request headers.
    """
    # Local import: only ``mkdir`` is imported from ``os`` at file level.
    from os import makedirs

    print(f'请求{response.request.url} 失败,返回值为{response.status_code}')
    print(f'请求头为{response.request.headers}')
    print(f'响应头为{response.headers}')

    # exist_ok avoids the check-then-create race of exists() followed by mkdir().
    makedirs(log_directory, exist_ok=True)

    t = datetime.today().strftime('%Y%m%d_%H%M%S')
    # Reporting an error must not raise a new one: a missing or malformed
    # Content-Type header simply falls back to the generic extension.
    try:
        extension = get_extension_by_headers(response)
    except (KeyError, IndexError):
        extension = ''
    if not extension:
        extension = 'log'

    filename = f"{log_directory}/{t}.{extension}"
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f'内容已写入至 {filename}。\n\n')

# selenium

from selenium import webdriver
# Shared WebDriver instance: get_score() drives the browser through this
# module-level name.
# NOTE(review): constructing it here starts a Chrome session as soon as this
# code is loaded — confirm that is intended.
driver = webdriver.Chrome()

# Score-query page that get_score() loads on every lookup.
url = "https://240334.yichafen.com/public/queryscore/sqcode/OsTcknwmMjkzfGZlYWYyOTQ1YjA5YmM1ZTQxOGQzMmY1NTdiYzNlYjI2fDI0MDMzNAO0O0OO0O0O.html"

def get_score(name: str, student_id: str) -> str:
    """Query the score page for one student and return the score cell.

    Operates on the module-level ``driver`` and ``url``: loads the page,
    fills in the student ID and name fields, submits the form, checks that
    the result row echoes the submitted ID and name, and returns the third
    cell of that row.

    Args:
        name: Student name typed into the ``s_xingming`` input.
        student_id: Student ID typed into the ``s_xuehao`` input.

    Returns:
        The ``innerHTML`` of the third cell of the result row.

    Raises:
        ValueError: If the result row cannot be read, or its ID/name cells
            differ from the submitted values.
    """
    # Local imports: the file only imports ``webdriver`` from selenium.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    def result_cell(column: int) -> str:
        """Return the innerHTML of the given 1-based cell of the result row."""
        xpath = f"//*[@id='result_content']/div[2]/table/tbody/tr[2]/td[{column}]"
        return driver.find_element(By.XPATH, xpath).get_attribute("innerHTML")

    # implicitly_wait() does not pause execution; it sets how long element
    # lookups keep polling before giving up.
    driver.implicitly_wait(1)
    driver.get(url)
    driver.refresh()

    # Locate the form fields by XPath. (In Chrome DevTools an element's XPath
    # can be copied via right click -> Copy -> Copy XPath.)
    # find_element(By.XPATH, ...) replaces find_element_by_xpath(), which
    # Selenium 4.3+ no longer provides.
    student_id_element = driver.find_element(By.XPATH, "//input[@name='s_xuehao']")
    student_id_element.clear()
    student_id_element.send_keys(student_id)
    name_element = driver.find_element(By.XPATH, "//input[@name='s_xingming']")
    name_element.clear()
    name_element.send_keys(name)

    button = driver.find_element(By.XPATH, "//*[@id='yiDunSubmitBtn']")
    button.click()
    # Result lookups below poll for at most 0.5 s.
    driver.implicitly_wait(0.5)

    # Explicit comparison instead of assert, which is stripped under -O.
    # Browser-side failures are still reported as ValueError, as before, but
    # unrelated exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
    try:
        matches = result_cell(1) == student_id and result_cell(2) == name
    except WebDriverException as err:
        raise ValueError("%s %s 信息不对应" % (name, student_id)) from err
    if not matches:
        raise ValueError("%s %s 信息不对应" % (name, student_id))

    return result_cell(3)


if __name__ == '__main__':
    # Batch lookup: read "name,student_id" rows from input.csv, query each
    # score and write "name,student_id,score" rows to output.csv.
    #
    # The module-level ``driver`` that get_score() operates on is reused here;
    # creating another webdriver.Chrome() would open a second browser and
    # leave the first one running unused.
    try:
        # Open the output once: reopening it with 'w' for every row truncated
        # the file each time, so only the last row survived.
        with open('input.csv') as source, open('output.csv', 'w') as output:
            for line in source:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines, e.g. a trailing newline
                name, student_id = line.split(',')
                score = get_score(name, student_id)
                row = "%s,%s,%s" % (name, student_id, score)
                print(row)
                output.write(row + "\n")
    finally:
        # Always shut the browser down, even when a lookup fails.
        driver.quit()

XPath 最常用的形式为 //input[@name='s_xuehao'],表示在整个文档中查找标签名为 input 且 name 属性值为 's_xuehao' 的元素。

# 文件

def tranverse(dir: str) -> list:
    """Walk a directory tree and collect the paths of all files in it.

    Args:
        dir: Root directory to walk.

    Returns:
        A list with one path per file found under ``dir`` at any depth.
        Directories themselves are not included. Each path is the walked
        directory joined with the file name, so it is relative or absolute
        depending on how ``dir`` was given.
    """
    # Local import: ``os`` itself is not imported in the visible part of this
    # file (only names from it are), so the bare ``os.walk`` raised NameError.
    import os

    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(dir)
        for name in names
    ]


def md5_file(file: str) -> str:
    """Compute the MD5 digest of a file's contents.

    The file is read in fixed-size chunks, so large files are hashed without
    loading them into memory in one piece.

    Args:
        file: Path of the file to hash.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    # Local import: ``hashlib`` is not imported in the visible part of this file.
    import hashlib

    digest = hashlib.md5()
    # Separate name for the handle so the ``file`` path argument is not shadowed.
    with open(file, 'rb') as stream:
        for chunk in iter(lambda: stream.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()