i春秋视频爬虫
最近买了一点网课需要下载下来方便观看,网上找到了一个脚本:
https://blog.csdn.net/a854596855/article/details/114901485
这位大哥写得很棒,思路也不错,基本上把原理都讲透了。
但是使用过程中还是遇到了一些问题:
课程列表无法解析(可能是我要爬的课程页面比较清奇,和一般的课程不太一样)。
AES解密有时候会出问题,导致最后拿到的视频是坏的:大约在每小时的第50分钟左右开始出现,持续十分钟左右,这段时间里拿到的视频都是损坏的。原因暂时没有找到——跟着大哥的方法把加密逻辑走了一遍,没发现什么问题;不过观察到拿到的十六进制密钥里有时会混入非十六进制的字符,可能还有一部分加密逻辑没被发现。
对上面大哥的脚本做了一点小改动:
手动指定每个课程的URL。
对下载完的视频进行校验,文件损坏则重新下载。(相当于每个小时卡顿十分钟,但是好歹保证最后拿到的视频都是可以直接看的,相当于牺牲效率换取一部分质量吧)
下面是我修改的脚本:
get_aes_key.py
# -*- coding: utf-8 -*-
"""Derive the AES key of an ichunqiu video from the key endpoint's response.

The single-letter helpers (n, t, r, i, e, a) keep their original names and
parameter names so existing callers keep working.
NOTE(review): the names look like a direct port of minified player code;
confirm against the reference implementation before renaming anything.
"""
import base64
import io

# Most recent key-endpoint response. i() reads its 't' field, so
# get_aes_key() stores the response here before deriving the key.
response = {}

# Base64 of the PNG (36x32 per its header) whose pixels form the lookup
# table searched by r().
_LOOKUP_PNG_B64 = (
    'iVBORw0KGgoAAAANSUhEUgAAACQAAAAgCAIAAAD1803ZAAAIjElEQVRIiQXBC/jQ470A8M/3fX8l'
    'Ebqr9Pjz15nr5FahMXK/XyK3x8gOsgdzmnOwqdlYbTw2dhzLM+cQndxqJSfMY2nluBapZJnb3Cfp'
    'Itnx/73v+XyIRr5Xs6N0IVeJtdwkXylmyyvEm/IIjhe7iG/oRR9xuqa/JmtO0QyWxmu2lP+Z16WR'
    '0iJ+LZ4SP2K+WKdp5Fv5JrhfXqNdy8XSvdyt9lKvkHZRLtT0UzvV55WNYgM/U2fJF6vdlXGaHurh'
    '4hX6a88XS8RftF/JvbQd4i35LV3viSniTnEceb3UUzNJHCY2imHyWLmIt/mC77KDNE06QXOx+Ezq'
    '4HGOlPrKjfgeIX/JC/IW8rPSn6QzWSkdLE8TK8QD0ih5q0Z7i/yJFteo6zU7sUA7UH1Ec772Cenv'
    '2qniB+owVqozxHxBDGAP8TLfZ5P8qHaDfDl70FOaIeZwjVjFMPG1GJhYzptqN2maOFb5rnYejdhV'
    '+5F0j7hM2lP9q3qg2IoPuV56jqKOUWeqXWwWmzXHqt2VpzleTFQ/1d6lHMMMdYQyKrFGncJ/aj+S'
    'W3G4VNS+opfaXfu59t/YnVC7lIXqcYzWrlZeE8SZ0mz1A/U67QIe5Yeir9hN+rm8RPodl3KDGjiW'
    'a8UX8h6a63hGOlFqWcGjLJDekI/hKV6TQu4QV0n7yZs1R3OW/BR7SL2l06SPRJd8rHwzp4kDmCDP'
    'EuOkZY1YoJ5MpzpBfUvaRixRekvnSE/yMYdptxR7s0n8XPup+Fw6Qp3HXpq3db3LE8o46US1UzpS'
    'PVRcSTf1UOkWZZ16PPskxmg+FKPFS8psdZL6e3VnZuuarN6o9hWNdKI4UrypKeIX6t3Kb5SsjpO/'
    'zRBRxQBeFd2kZeoJdMhbiDNZIA2Svx3SbsoGuR9DlAHqIdIKZqkDpWPUPgyVbtc1XapigfZWeX+W'
    'a0/SPEQnuysXiINIyq7SZLU3G5SRXCIvZjoHEDSd4gmZdJkmSRulb8lzNJdIh0mf8Lb0mThEPC1O'
    'lu/g/+Tx8iua8WK5tEqaKY2T75Yf5mDpffkszhGTpLO5T8yV/9Zou5gidtB+IQZIh+g6QzpUeU6a'
    'xlSI9/hCuZEFynr5XPVTUdXJojBVfYZX1cv4RCwWb4tO6VppkHq9/KLyonb7hk4eVrcVl2mHM5Lh'
    'ymB5qLKDOopfa/vKW7FRWq8+wI/Uc5Wl0gvqntI+ykx5vTJfXam5kC919ZbeVPfS/l5MUKdKncQy'
    'sVR6VTwodRNjxcfiE3GddChrNQeKn0jD5Y2a7tIasRM96SbeF/dohkqXS/vKSR4mLed0eYpmlDxG'
    'ni+/JJaII4IBzOd0eYM4Ue2n7E4PzpOOV8apU+U/KsvEEmVHVosNhPRfykHic3E/76qHK9eIU6Vn'
    'lLP5TDpS+4Q0R9lP7lAXZ02WPpBGspt2IR+rP2au1F19UixWt+crLuG38gvSVLFK3Zk16itMYJo6'
    'Tj1WHqLOoI96IWfyqTSXu6SXWS2uTcqZuvbVDlOHSAuZLZYxVtlTmsomzXS1uzxS2lX5lvYmthOL'
    'WCB1k06R+kuHcba2J7coywjpUSYpQ6Uz1L7qOvU3IR+gPUTaTX1IHsJf2UY7SH5Ee7k0QDtBeln5'
    'XDpQzeoIuZ/2delXyr7ierW31EPpIT+s7ZDXajvkS5UBYrTyGR+Kd6Snk9ggvyZ2kQeqF6ifaZ+V'
    'Rut6T3pfnSvvrA5mC+VfuUTTTxws/4T18k+l5eIi9pAW84hYp72aB7X7CMrvpF2kHeVZbMxivNrB'
    'IO0Z6vvycsaod7JB/gc3KfPVOfJAdXt5Ksexl1rFR+IN5QaxSd1Wupc+Yh3TpJ3Ul9Tvyaeox9Gh'
    'LJKnJ+UmMVbppJM3tO9yEltLY5X9xJFisRiqnCIPUx5X7lFXig71RV0/UH+m/l3qUGaznkZdzijR'
    'Ssu1R6i3Sb2ZpcwM+XzlWXE'
    '7PZReTGCk/Jx6JR9L8+ij7qmdK70i9dEu5XaxnPUqTmWW1Cme106Vr1VmK39mjrhO/VpuWKd8qQ7'
    'PYm9ppjJMbcTOmifFcawQL3Ka9gWpU/tjebCYQZVu4yKJ8hJHq+9ym/Rb8U/KDaxWnpXule6QprF'
    'EmqXdX95RvT9zkLpBGqBulhaoI5WjxGB2Uo8RH0j/UPaSZ2kfkW5UB/GoukodKf1CXqFMlvvqOkP'
    'aTV0szxTvqN9Rz1POkf4otlI/UR9qxHvSAPUCaa26Vh3ADOaJL5VL5VvUVWJ/5TL5LmWhtJf6ovL'
    'vTFLf1TVY/oPyurxK3Ul8LfZWBivHiK9EH/Ux5Q7+W/5+lkbQqDer3XhZzNQ8xpZijTpNadggtlY'
    'min7S3tqhYrh0NpdK94np0jfqZKWLbXlQHSvtI8ZJq8XVyjL+wAfKdqHZStmsLJWOVjaJk+QnlEX'
    'qSOku7Z/FoWKiSOpUcZ+6mSnSFdpDxWhlvdRfHc5K8aG4iO21R4mBjBXviKp9TVOUG7IyXj1dHsM'
    '8rpCOUjql98Uc7Qmig3nSfG5WjpA3Ko+LQQp1trSZ9WK8fDkvcSq7Sr2V0zXPqP8r7af9WmyUZmq'
    '/SuwsBmsXiCvktWovaYn6K3GetI08Vx6h/UQ5Q8xXxqlvK3+SNspdDONO9UNliKjqImW5rnmaq/g'
    'b65T/oL+4Wukjtk7SRDGRhepZyo08KQaLL7XfUQbrel2bNOOZKK8WszRna0YpSdtTfU8cSBab1OO'
    'lSfK/yP21P2Upo8VKcYN4nhPVEaHp0H4uX6D2VKvoqb1P3lb5VCxSnpdPVhaKi9go5qoTxIF0KoO'
    '4VRypvM8KsZ6HxVJldzFbPKa00mTxkLa7vEZ9OWkvV+/XdQit+j/aL8Sl2r9Ityg9xBTlWbGLOEz'
    'NylqpB1+p5/KGepM6iJflq8R08YDyQ+mX6oPSdpodxEnqwZptGSNt9/+pIKYt7f3+lgAAAABJRU5'
    'ErkJggg=='
)


def n(e, t):
    """XOR the characters of str ``e`` with key str ``t`` and return a str.

    ``t`` is repeated cyclically when it is shorter than ``e``.
    """
    key_len = len(t)
    return ''.join(
        chr(ord(ch) ^ ord(t[pos % key_len])) for pos, ch in enumerate(e)
    )


def t(e, t):
    """XOR every byte of ``e`` with the matching character of key ``t``.

    ``e`` must index to ints (e.g. ``bytes``); ``t`` is a str repeated
    cyclically. Returns the result as a str, one character per input byte.
    """
    key_len = len(t)
    return ''.join(
        n(chr(byte), t[pos % key_len]) for pos, byte in enumerate(e)
    )


def r(e, t):
    """Map hex value ``e`` to the next chain value via lookup table ``t``.

    ``e`` (str or bytes of hex digits) is parsed as an integer and searched
    for in the 36x32 table ``t`` (first index 0-35, second index 0-31, first
    match wins). With the match at ``t[row][col]`` the result is
    ``(7 * row + col) % 255`` as a hex string without the ``0x`` prefix.

    Raises:
        ValueError: if ``e`` is not valid hexadecimal.
    """
    target = int(e, 16)
    row, col = next(
        ((o, s) for o in range(36) for s in range(32) if t[o][s] == target),
        (-1, -1),
    )
    # NOTE(review): when the value is absent, row == col == -1 and Python's
    # modulo turns -8 into 247 ('f7'); a language with sign-preserving `%`
    # would yield -8 instead. Behaviour is kept as-is -- confirm which result
    # the reference implementation expects.
    return hex((7 * row + col) % 255)[2:]


def i():
    """Return the 180-second slot index (0-127) encoded in ``response['t']``.

    ``response['t']`` is split on '-' into three integers which are weighted
    as hours, minutes and seconds.
    """
    hours, minutes, seconds = (
        int(part) for part in response['t'].split('-')
    )
    total_seconds = 3600 * hours + 60 * minutes + seconds
    return total_seconds // 180 % 128


def e(e, n):
    """Decode the obfuscated key bytes ``e`` with lookup table ``n``.

    Two bytes sitting at offset ``i() % 32`` are dropped from ``e``; the rest
    is processed in 2-byte chunks, each XORed (via ``t``) with the next value
    of a chain produced by ``r``, seeded with the hex form of ``i()``.
    Returns the decoded characters as a str.
    """
    slot = i()
    offset = slot % 32
    # Drop only the two bytes at the offset. The previous
    # `c.replace(c[:2], b'')` removed *every* occurrence of that byte pair in
    # the tail, corrupting the key whenever the pair happened to repeat.
    stripped = e[:offset] + e[offset + 2:]

    link = hex(slot)[2:].encode('utf-8')
    pieces = []
    for start in range(0, len(stripped), 2):
        link = r(link, n)
        pieces.append(t(stripped[start:start + 2], link))
    return ''.join(pieces)


def a(e):
    """Pack the first three channels of pixel ``e`` into one 24-bit integer."""
    red, green, blue = e[0], e[1], e[2]
    return ((red << 16) + (green << 8) + blue) & 0xFFFFFF


def save2png():
    """Write the embedded lookup image to ``1.png`` in the current directory.

    Kept for callers that want the file on disk; get_aes_key() decodes the
    image in memory and does not call this.
    """
    with open('1.png', 'wb') as f:
        f.write(base64.b64decode(_LOOKUP_PNG_B64))


def get_aes_key(res):
    """Return the key string derived from key-endpoint response ``res``.

    ``res`` must provide ``'t'`` (three '-'-separated integers) and
    ``'data'`` (base64 text). It is stored in the module-level ``response``
    because i() reads the time field from there.
    """
    # Imported lazily so the pure helpers above stay importable without
    # Pillow installed.
    from PIL import Image

    global response
    response = res

    # Decode the lookup image in memory instead of writing 1.png and shelling
    # out to `rm -rf`: no temp file, nothing clobbered, works on any OS.
    with Image.open(io.BytesIO(base64.b64decode(_LOOKUP_PNG_B64))) as img:
        pixels = img.load()
        width, height = img.size
        table = [
            [a(pixels[x, y]) for y in range(height)] for x in range(width)
        ]
    return e(base64.b64decode(response['data']), table)
download_ichunqiu.py
# -*- coding: utf-8 -*-
"""Download and decrypt the ichunqiu course videos listed in ``urls``.

Each lesson page is fetched, its m3u8 playlist is resolved to the 720
variant, every ts segment is decrypted with AES-CBC and appended to
``<OUTPUT_DIR>/<n>.mp4``. A finished file is checked with OpenCV and the
lesson is retried until the check passes.
"""
import os
import re
import sys
import time

import cv2
import requests
from bs4 import BeautifulSoup
from Crypto.Cipher import AES
from requests.packages import urllib3

from get_aes_key import get_aes_key

# Every request below uses verify=False, so silence the resulting warnings.
# NOTE(review): certificate verification is disabled throughout; confirm this
# is really required for the site.
urllib3.disable_warnings()

COURSE_URL_PREFIX = 'https://www.ichunqiu.com/train/course/detail/293/'

# Lesson ids in download order (the position in this list, counted from 1,
# becomes the output file name).
LESSON_IDS = (
    66536, 66537, 66539, 66540, 66542, 66544, 66546, 66547, 66554, 66556,
    66561, 66627, 66628, 66629, 66630, 66632, 66633, 66635, 66640, 66642,
    66644, 66646, 66637, 66647, 66674, 66676, 66679, 66681, 66682, 66684,
    66686, 66698, 66699, 66700, 66702, 66704, 66705, 66710, 66711, 66712,
    66713, 66714, 66715, 66716, 66718, 66719, 66720, 66721, 66722, 66723,
    66724, 66725, 66726, 66728, 66729, 66731, 66732, 66733, 66734, 66735,
    66736, 68288, 68289, 68290, 68291, 66902, 66903, 66904, 66905, 66906,
    66907, 66908, 66909, 66910, 66911, 66912, 66913, 66914, 66915, 66916,
    66917, 66918, 66919, 66920, 66921, 66922, 66923, 66924, 66925, 66928,
    66929, 66930, 66931, 66932, 66933, 66935, 66936, 66938, 66939, 66940,
    66942, 66943, 66944, 66945, 66946, 66947, 66948, 66949, 66950, 66951,
    66952, 66953, 66954, 66955, 66956, 67121, 67122, 67123, 67124, 67125,
    67126, 67127, 67128, 67129, 67130, 67131, 67132, 67133, 67134, 67135,
    67136, 67137, 67138, 67139, 67140, 68465,
)

# Address of every lesson page. These are listed by hand; for an ordinary
# course they could probably be scraped from the course page instead
# (untested by the original author).
urls = [COURSE_URL_PREFIX + str(lesson_id) for lesson_id in LESSON_IDS]

# Directory the finished videos are written to.
OUTPUT_DIR = '/home/ubuntu/ichunqiu'

# 1-based position in ``urls`` of the first lesson to download; earlier
# lessons are skipped. Set to 1 to download everything.
START_INDEX = 80

# Seconds to wait before retrying a lesson that failed.
RETRY_DELAY = 10

# Headers sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'
}

# Shared session so the login cookies are reused by later requests.
s = requests.Session()


def fill_character(value):
    """Return ``value`` forced to exactly 16 characters.

    Shorter strings are right-padded with NUL characters, longer ones are
    truncated.
    """
    if len(value) < 16:
        return value.ljust(16, '\000')
    return value[:16]


def get_key(video_id):
    """Fetch the key material for ``video_id`` and return the derived key.

    Returns an empty string when the key endpoint does not answer with 200.
    """
    key_url = 'https://www.ichunqiu.com/video/key/%s' % video_id
    res1 = s.get(key_url, headers=headers, verify=False)
    if res1.status_code == 200:
        # The key is computed from the fields of the JSON response.
        return get_aes_key(res1.json())
    return ''


def decrypt_single_ts(ts, iv_str, key_str):
    """Decrypt one ts segment with AES-CBC.

    Args:
        ts: encrypted segment bytes.
        iv_str: IV as a hex string.
        key_str: key as a hex string.

    Returns:
        Decrypted bytes, the same length as ``ts``.

    Raises:
        ValueError: if ``iv_str`` / ``key_str`` are not valid hex or have a
            length AES rejects.
    """
    iv = bytes.fromhex(iv_str)
    key = bytes.fromhex(key_str)
    # CBC needs whole blocks. Append zero bytes up to the next block boundary
    # (the old code replaced the *last* bytes with zeros, which destroyed
    # data and still left the length unaligned) and cut them off afterwards.
    pad_len = -len(ts) % AES.block_size
    cipher = AES.new(key, AES.MODE_CBC, iv)
    plain = cipher.decrypt(ts + bytes(pad_len))
    return plain[:len(ts)]


def handle_m3u8_data(m3u8_url, title):
    """Download and decrypt every segment of a playlist into one file.

    Args:
        m3u8_url: address of the m3u8 playlist.
        title: path of the output file.

    Raises:
        RuntimeError: if the playlist, the key or any segment cannot be
            fetched. Nothing is left at ``title`` in that case, so a
            half-written video can never be mistaken for a finished one.
    """
    res = s.get(m3u8_url, headers=headers, verify=False)
    if res.status_code != 200:
        raise RuntimeError(
            'playlist request failed (%s): %s' % (res.status_code, m3u8_url)
        )
    data = res.text.strip()

    # Encryption method, video id and IV come from the EXT-X-KEY tag.
    key_tags = re.findall(
        r'#EXT-X-KEY:METHOD=(.*?),URI="http:\/\/www.ichunqiu.com\/videokey\?vid=(\d+)",IV=0x(.*?)\n',
        data,
    )
    if not key_tags:
        raise RuntimeError('no EXT-X-KEY tag found in %s' % m3u8_url)
    _aes_method, video_id, iv_str = key_tags[0]
    ts_uri_list = re.findall(r'(\d+\.ts)\n', data)

    key_str = get_key(video_id)
    print("KEY:", key_str)
    print("IV:", iv_str)
    if not key_str:
        raise RuntimeError('could not fetch the key of video %s' % video_id)

    url_base = m3u8_url[:m3u8_url.rfind('/') + 1]
    part_path = '%s.part' % title
    try:
        # Stream segment by segment into a temporary file instead of holding
        # the whole video in memory, then move it into place when complete.
        with open(part_path, 'wb') as out:
            for ts_url in ts_uri_list:
                res1 = s.get(url_base + ts_url, headers=headers, verify=False)
                if res1.status_code != 200:
                    # Skipping a segment would silently corrupt the video.
                    raise RuntimeError(
                        'segment request failed (%s): %s'
                        % (res1.status_code, ts_url)
                    )
                out.write(decrypt_single_ts(res1.content, iv_str, key_str))
        os.replace(part_path, title)
    finally:
        if os.path.exists(part_path):
            os.remove(part_path)


def _download_lesson(lesson_url, filename):
    """Download one lesson to ``filename``; return True if OpenCV can open it."""
    res2 = s.get(lesson_url, headers=headers, verify=False)
    if res2.status_code != 200:
        raise RuntimeError(
            'lesson page request failed (%s): %s'
            % (res2.status_code, lesson_url)
        )
    found = re.search(r'data-video-url="(https.*?m3u8)', res2.text)
    if found is None:
        raise RuntimeError('no video url on page %s' % lesson_url)
    m3u8_url1 = found.group(1)

    # Insert the '720/' path component in front of the playlist file name.
    cut = m3u8_url1.rfind('/') + 1
    m3u8_url = m3u8_url1[:cut] + '720/' + m3u8_url1[cut:]
    print(m3u8_url)

    handle_m3u8_data(m3u8_url, filename)
    print('下载完毕')

    # A wrong key yields an unplayable file; use OpenCV as the validity check.
    vid = cv2.VideoCapture(filename)
    try:
        return vid.isOpened()
    finally:
        vid.release()


def spider(start_index=START_INDEX):
    """Log in and download every lesson in ``urls`` from ``start_index`` on.

    Args:
        start_index: 1-based position of the first lesson to download.

    A lesson is retried every RETRY_DELAY seconds, without limit, until its
    file passes the OpenCV check.
    """
    url = 'https://user.ichunqiu.com/login/normal'
    # Login form; 'account' and 'password' must be filled in before running.
    # NOTE(review): the meaning of 'mt' and 'rs' is not visible here --
    # presumably values captured from a browser login; verify they are still
    # accepted.
    data = {
        'redirect_url': "https://www.ichunqiu.com/",
        'appid': '5af018bda55004e1',
        'account': '',
        'password': '',
        'captcha': '',
        'mt': '1577261775197',
        'rs': 'e8c48853ab79882bc54ef7f6b5452c98'
    }
    res = s.post(url=url, data=data, headers=headers, verify=False)
    if res.status_code != 200 or res.json()['code'] != 0:
        print('登陆失败')
        return

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for index, keshi_url in enumerate(urls, start=1):
        if index < start_index:
            continue
        filename = os.path.join(OUTPUT_DIR, '%d.mp4' % index)
        while True:
            try:
                if _download_lesson(keshi_url, filename):
                    break
                print('文件损坏')
            except Exception as exc:
                # Not a bare `except:` -- Ctrl-C must still stop the script.
                print(index, "文件损坏、密钥损坏", exc)
            # Always pause before retrying so a failing page is not hammered.
            time.sleep(RETRY_DELAY)


if __name__ == '__main__':
    spider()
放在服务器上挂了一天,终于爬完了,真不错。