# 爬虫
# requests
import requests
from lxml import etree
# Default request headers sent with every request made through the session below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us",
    "Connection": "keep-alive",
    "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7",
}

if __name__ == '__main__':
    session = requests.session()
    # Merge into the session's existing header mapping instead of replacing it
    # with a plain dict, so the session keeps its own header container.
    session.headers.update(headers)
    url = ''  # placeholder: set this to the page to fetch
    response = session.get(url)
    # Hand the parser the raw bytes so it can work out the charset itself.
    # A bare .decode() assumes UTF-8 and raises on GB2312 pages, which the
    # Accept-Charset header above explicitly asks for.
    html = etree.HTML(response.content)
    # Guard each step: the page may have no <title>, or an empty one.
    title_nodes = html.xpath(r'//title') if html is not None else []
    titleElement = title_nodes[0] if title_nodes else None
    title_words = (titleElement.text or '').split() if titleElement is not None else []
    # First whitespace-separated word of the title, or '' when there is none.
    title = title_words[0] if title_words else ''
一些工具函数:
from datetime import datetime
from os import mkdir
from os.path import exists
import requests
from lxml import etree
# Directory that log_request_error() saves failed response bodies into;
# it is created on first use if it does not exist yet.
log_directory = 'log'
def get_filename_by_url(response: requests.Response) -> str:
    """Return the last path segment of the response's URL.

    Only the path component is considered, so a query string or fragment
    (``report.pdf?download=1#top``) does not end up in the file name.

    Args:
        response: Object exposing the final URL as ``response.url``.

    Returns:
        The text after the last ``/`` of the URL path. This is an empty
        string when the path is empty or ends with ``/``.
    """
    from urllib.parse import urlsplit

    path = urlsplit(response.url).path
    return path.split('/')[-1]
def get_extension_by_headers(response: requests.Response) -> str:
    """Derive a file extension from the response's ``content-type`` header.

    Args:
        response: Object exposing its headers as a mapping in
            ``response.headers``.

    Returns:
        The subtype part of the media type (``'html'`` for
        ``'text/html;charset=UTF-8'``). An empty string is returned when
        the header is missing or has no ``type/subtype`` form, so callers
        can fall back to a default extension.
    """
    # content_type is like 'text/html;charset=UTF-8'
    content_type = response.headers.get('content-type', '')
    # Drop parameters such as ';charset=...' before splitting type/subtype.
    media_type = content_type.split(';')[0]
    parts = media_type.split('/')
    if len(parts) < 2:
        return ''
    return parts[1].strip()
def log_request_error(response: requests.Response):
    """Report a failed request on stdout and save the response body to disk.

    Prints the request URL, status code, request headers and response
    headers, then writes the raw response body to
    ``<log_directory>/<YYYYmmdd_HHMMSS>.<extension>``. The extension comes
    from get_extension_by_headers() and falls back to ``log`` when that
    returns an empty string.

    Args:
        response: The response of the failed request.
    """
    from os import makedirs

    print(f'请求{response.request.url} 失败,返回值为{response.status_code}')
    print(f'请求头为{response.request.headers}')
    print(f'响应头为{response.headers}')

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    makedirs(log_directory, exist_ok=True)

    t = datetime.today().strftime('%Y%m%d_%H%M%S')
    extension = get_extension_by_headers(response) or 'log'
    filename = f"{log_directory}/{t}.{extension}"
    # Binary mode: the body is written exactly as received.
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f'内容已写入至 {filename}。\n\n')
# selenium
- Selenium Webdriver 简易教程 - wizardforcel
- Selenium with Python 中文翻译文档
- ChromeDriver
from selenium import webdriver
# Shared Chrome session, started as soon as this module is loaded;
# get_score() below drives this browser.
driver = webdriver.Chrome()
# Address of the score query page that get_score() loads before filling in the form.
url = "https://240334.yichafen.com/public/queryscore/sqcode/OsTcknwmMjkzfGZlYWYyOTQ1YjA5YmM1ZTQxOGQzMmY1NTdiYzNlYjI2fDI0MDMzNAO0O0OO0O0O.html"
def get_score(name: str, student_id: str) -> str:
    """Look up one student's score through the query page.

    Loads ``url`` in the shared ``driver``, fills in the student id and
    name, submits the form, checks that the result row belongs to the
    requested student, and returns the score cell.

    Args:
        name: Student name typed into the ``s_xingming`` input.
        student_id: Student id typed into the ``s_xuehao`` input.

    Returns:
        The ``innerHTML`` of the third cell of the result row.

    Raises:
        ValueError: If the result row cannot be read, or if its id or name
            cell differs from the requested values.
    """
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    # Second table row inside the result container; its cells are id, name, score.
    result_row = "//*[@id='result_content']/div[2]/table/tbody/tr[2]"

    # implicitly_wait sets how long element lookups keep polling (here up
    # to 1 s); it is not a fixed sleep.
    driver.implicitly_wait(1)
    driver.get(url)  # load the query page
    driver.refresh()  # reload it

    # Locate elements by XPath. In Chrome DevTools an element's XPath is
    # available via right-click -> Copy -> Copy XPath.
    # find_element(By.XPATH, ...) is used because the find_element_by_*
    # helpers no longer exist in current Selenium releases.
    student_id_element = driver.find_element(By.XPATH, "//input[@name='s_xuehao']")
    student_id_element.clear()  # empty the input box
    student_id_element.send_keys(student_id)  # type the student id

    name_element = driver.find_element(By.XPATH, "//input[@name='s_xingming']")
    name_element.clear()
    name_element.send_keys(name)  # type the name

    # Find and click the submit button.
    button = driver.find_element(By.XPATH, "//*[@id='yiDunSubmitBtn']")
    button.click()

    # Lookups for the result cells poll for up to 0.5 s.
    driver.implicitly_wait(0.5)

    try:
        shown_id = driver.find_element(By.XPATH, f"{result_row}/td[1]").get_attribute("innerHTML")
        shown_name = driver.find_element(By.XPATH, f"{result_row}/td[2]").get_attribute("innerHTML")
    except WebDriverException as err:
        # No readable result row is reported the same way as a mismatch.
        raise ValueError("%s %s 信息不对应" % (name, student_id)) from err

    # Explicit comparison instead of assert, which is stripped under `python -O`.
    if shown_id != student_id or shown_name != name:
        raise ValueError("%s %s 信息不对应" % (name, student_id))

    # The score is the third cell of the same row.
    return driver.find_element(By.XPATH, f"{result_row}/td[3]").get_attribute("innerHTML")
if __name__ == '__main__':
    # Reads "name,student_id" rows from input.csv, queries each score, and
    # writes "name,student_id,score" rows to output.csv while echoing them.
    # The module-level `driver` is already a running Chrome session, so it
    # is reused here instead of starting a second browser.
    try:
        # Open the output once: reopening it in 'w' mode for every row
        # would truncate the file and keep only the last result.
        with open('input.csv') as input_file, open('output.csv', 'w') as output:
            for line in input_file:
                line = line.strip()
                if not line:
                    continue  # skip blank lines, e.g. a trailing newline
                name, student_id = line.split(',')
                score = get_score(name, student_id)
                row = "%s,%s,%s" % (name, student_id, score)
                print(row)
                output.write(row + "\n")
    finally:
        # Always close the browser, even when a lookup fails.
        driver.quit()
XPath 最常用的形式为 `//input[@name='s_xuehao']`,表示在全文中寻找类型为 `input` 且具有 `name='s_xuehao'` 属性的元素。
# 文件
def traverse(dir: str):
    """
    Walk the given directory tree and return a list with the path of every
    file found in it (directories themselves are not included).
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(dir)
        for name in names
    ]
# 手写递归版
# print(traverse('.', ['tsx'], ['node_modules']))
def traverse(dir_path: str, file_pattern: list, ignored_directory: list) -> list:
    """
    Recursively list the files under dir_path (directories themselves are
    not included in the result).

    A file is kept when the text after the last '.' in its name (the whole
    name if it has no dot) is in file_pattern. Sub-directories whose name
    is in ignored_directory are not descended into.
    """
    matched = []
    for entry in os.listdir(dir_path):
        entry_path = os.path.join(dir_path, entry)
        if not os.path.isdir(entry_path):
            # Plain file: keep it when its suffix is one of the wanted ones.
            if entry.rsplit('.', 1)[-1] in file_pattern:
                matched.append(entry_path)
            continue
        if entry in ignored_directory:
            continue
        # Directory that is not ignored: collect its matches too.
        matched.extend(traverse(entry_path, file_pattern, ignored_directory))
    return matched
def md5_file(file: str) -> str:
    """
    Compute the MD5 digest of the file at the given path.

    The file is read in fixed-size chunks, so large files are hashed
    without being loaded into memory all at once.

    Args:
        file: Path of the file to hash.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    chunk_size = 1 << 20  # 1 MiB per read
    digest = hashlib.md5()
    with open(file, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
# 多线程和协程
# 多线程
def func(a, b, c):
    """Print the three arguments on one line; used as the thread target below."""
    print(a, b, c)
# Start one thread per iteration, each running func(a, b, c), and remember
# every thread so it can be joined afterwards.
# NOTE(review): a, b and c are not defined in this snippet; they are assumed
# to come from the surrounding code. The loop also starts 1,000,000 threads
# before joining any of them - confirm that count is intended.
threads = []
for _ in range(1000000):
    worker = threading.Thread(target=func, args=(a, b, c))
    threads.append(worker)
    worker.start()

# Block until every started thread has finished.
for worker in threads:
    worker.join()
# 带锁的多线程:
# Lock that guards updates to the shared value `a`.
# NOTE(review): `a` is not defined in this snippet; it is assumed to be a
# counter shared between threads.
lock = threading.Lock()
# Holding the lock makes the read-modify-write of `a` exclusive; the `with`
# statement releases the lock even if the body raises.
with lock:
    a += 1
# 协程
async def func(a, b, c):
    """Coroutine that prints its three arguments on one line."""
    values = (a, b, c)
    print(*values)
async def main():
    """Schedule two func(a, b, c) coroutines as tasks and wait for both."""
    # NOTE(review): a, b and c are not defined in this snippet; they are
    # assumed to come from the surrounding code.
    pending = [
        asyncio.create_task(func(a, b, c)),
        asyncio.create_task(func(a, b, c)),
    ]
    await asyncio.gather(*pending)


# Run main() to completion on a fresh event loop.
asyncio.run(main())
# 带锁的协程:
# Lock that guards updates to the shared value `a` between coroutines.
# NOTE(review): `a` is not defined in this snippet; it is assumed to be a
# module-level counter shared between tasks.
lock = asyncio.Lock()


async def locked_increment():
    """Add 1 to the module-level `a` while holding `lock`.

    An asyncio.Lock is acquired with ``async with``, which is only valid
    inside a coroutine; a plain ``with lock:`` fails at runtime.
    """
    global a
    async with lock:
        a += 1