当前位置:首页 > DayDayUp > 正文内容

i春秋视频爬虫

Luz3周前 (09-25)DayDayUp177

最近买了一点网课需要下载下来方便观看,网上找到了一个脚本:

https://blog.csdn.net/a854596855/article/details/114901485

这个大哥写的很棒,思路也不错,基本上把原理都讲透了。


但是使用过程中还是遇到了一些问题:

  1. 课程列表无法解析(可能是我要爬的课程页面比较清奇,和一般的课程不太一样)。

  2. AES解密有时候会出问题,导致最后拿到的视频是坏的(大约在每小时的第50分钟左右出现,持续十分钟左右,这段时间里拿到的视频都是损坏的。原因暂时没有找到:跟着大哥的方法把加密逻辑走了一遍,没发现什么问题;但拿到的16进制密钥里有时会混入其他字符,可能还有一部分加密逻辑没被发现)。


对上面大哥的脚本做了一点小改动:

  1. 手动指定每个课程的URL。

  2. 对下载完的视频进行校验,文件损坏则重新下载。(代价是每个小时大约有十分钟在反复重试,但好歹能保证最后拿到的视频都可以直接观看,算是牺牲一点效率换取质量吧)


下面是我修改的脚本:

get_aes_key.py

#-*- coding:utf-8 -*-
import base64
from PIL import Image
#Module-level state shared by the helpers in this script
import os
response = {}  # replaced by get_aes_key() on every call; i() reads response['t']

def n(e,t):
    """XOR two strings character by character, cycling through ``t``.

    Args:
        e: Text to transform.
        t: Key text; its characters are reused from the start whenever
            ``e`` is longer than ``t``.

    Returns:
        A string with one character per character of ``e``, each being
        ``chr(ord(e[k]) ^ ord(t[k % len(t)]))``.

    Raises:
        ZeroDivisionError: If ``t`` is empty while ``e`` is not.
    """
    key_len = len(t)
    return ''.join(
        chr(ord(ch) ^ ord(t[pos % key_len])) for pos, ch in enumerate(e)
    )

def t(e, t):
    """XOR every element of ``e`` with the matching (cycled) entry of ``t``.

    Args:
        e: Indexable sequence of integer code points (for example ``bytes``);
            each element is converted with ``chr`` before being XOR-ed.
        t: Indexable key; entry ``k % len(t)`` is paired with ``e[k]`` and
            handed to ``n`` as the XOR key.

    Returns:
        The concatenation of ``n(chr(e[k]), t[k % len(t)])`` for every
        index ``k`` of ``e``.
    """
    size, key_len = len(e), len(t)
    return ''.join(n(chr(e[pos]), t[pos % key_len]) for pos in range(size))


def r(e, t):
    """Look up a hex value in the table and encode its position as hex text.

    The table is scanned as 36 rows of 32 columns, row by row, and the first
    cell equal to ``int(e, 16)`` wins.

    Args:
        e: Hexadecimal digits (anything ``int(e, 16)`` accepts).
        t: Two-level indexable table, read as ``t[row][column]``.

    Returns:
        ``hex((7 * row + column) % 255)`` without the ``0x`` prefix. When the
        value is not in the table, row and column both count as -1.
    """
    wanted = int(e, 16)
    row, column = next(
        (
            (o, s)
            for o in range(36)
            for s in range(32)
            if wanted == t[o][s]
        ),
        (-1, -1),
    )
    return hex((7 * row + column) % 255)[2:]


def i():
    """Return the index of the 180-second window named by ``response['t']``.

    ``response['t']`` must hold three dash-separated integers; they are
    weighted 3600, 60 and 1 to form a count of seconds.

    Returns:
        int: The number of whole 180-second windows in that count, modulo 128.
    """
    window = 180
    hours, minutes, seconds = (int(part) for part in response['t'].split('-'))
    elapsed = 3600 * hours + 60 * minutes + seconds
    window_start = elapsed - elapsed % window
    return int(window_start / window % 128)

def e(e, n):
    """Decode the obfuscated key payload into the key text.

    Args:
        e: Payload bytes (``get_aes_key`` passes the base64-decoded
            ``response['data']``).
        n: Lookup table forwarded to ``r`` (``get_aes_key`` passes the table
            built from the embedded PNG).

    Returns:
        str: The outputs of ``t`` for each 2-byte chunk of the payload,
        concatenated in order.
    """
    slot = i()
    offset = slot % 32
    seed = hex(slot)[2:].encode('utf-8')

    # Drop only the two bytes that sit at ``offset``. The previous
    # ``bytes.replace`` call had no count, so it also deleted any later repeat
    # of that pair and silently shortened the payload.
    # NOTE(review): assumes only the pair at the offset is a marker - confirm
    # against the site's JavaScript.
    body = e[:offset] + e[offset + 2:]

    pieces = []
    mask = ''
    for pos in range(0, len(body), 2):
        chunk = body[pos:pos + 2]
        # The first mask comes from the time slot; each later one is derived
        # from the mask before it.
        mask = r(seed, n) if mask == '' else r(mask, n)
        pieces.append(t(chunk, mask))
    return ''.join(pieces)

def a(e):
    """Pack the first three channel values of a pixel into one integer.

    Args:
        e: Indexable pixel whose items 0, 1 and 2 are integers.

    Returns:
        int: ``(e[0] << 16) + (e[1] << 8) + e[2]`` for channel values in the
        0-255 range. The value is built with a leading marker bit at 2**24
        and the first hex digit is then dropped.
    """
    high, mid, low = e[0], e[1], e[2]
    marked = (1 << 24) + (high << 16) + (mid << 8) + low
    return int(format(marked, 'x')[1:], 16)

def save2png(path='1.png'):
    """Write the embedded base64-encoded PNG to disk.

    Args:
        path: Destination file; defaults to ``1.png`` in the current
            directory, which is where ``get_aes_key`` reads it from.
    """
    # The literal is written as two adjacent strings so that no string
    # literal spans a physical line break.
    png_bs64_string = (
        'iVBORw0KGgoAAAANSUhEUgAAACQAAAAgCAIAAAD1803ZAAAIjElEQVRIiQXBC/jQ470A8M/3fX8lEbqr9Pjz15nr5FahMXK/XyK3x8gOsgdzmnOwqdlYbTw2dhzLM+cQndxqJSfMY2nluBapZJnb3CfpItnx/73v+XyIRr5Xs6N0IVeJtdwkXylmyyvEm/IIjhe7iG/oRR9xuqa/JmtO0QyWxmu2lP+Z16WR0iJ+LZ4SP2K+WKdp5Fv5JrhfXqNdy8XSvdyt9lKvkHZRLtT0UzvV55WNYgM/U2fJF6vdlXGaHurh4hX6a88XS8RftF/JvbQd4i35LV3viSniTnEceb3UUzNJHCY2imHyWLmIt/mC77KDNE06QXOx+Ezq4HGOlPrKjfgeIX/JC/IW8rPSn6QzWSkdLE8TK8QD0ih5q0Z7i/yJFteo6zU7sUA7UH1Ec772Cenv2qniB+owVqozxHxBDGAP8TLfZ5P8qHaDfDl70FOaIeZwjVjFMPG1GJhYzptqN2maOFb5rnYejdhV+5F0j7hM2lP9q3qg2IoPuV56jqKOUWeqXWwWmzXHqt2VpzleTFQ/1d6lHMMMdYQyKrFGncJ/aj+SW3G4VNS+opfaXfu59t/YnVC7lIXqcYzWrlZeE8SZ0mz1A/U67QIe5Yeir9hN+rm8RPodl3KDGjiWa8UX8h6a63hGOlFqWcGjLJDekI/hKV6TQu4QV0n7yZs1R3OW/BR7SL2l06SPRJd8rHwzp4kDmCDPEuOkZY1YoJ5MpzpBfUvaRixRekvnSE/yMYdptxR7s0n8XPup+Fw6Qp3HXpq3db3LE8o46US1UzpSPVRcSTf1UOkWZZ16PPskxmg+FKPFS8psdZL6e3VnZuuarN6o9hWNdKI4UrypKeIX6t3Kb5SsjpO/zRBRxQBeFd2kZeoJdMhbiDNZIA2Svx3SbsoGuR9DlAHqIdIKZqkDpWPUPgyVbtc1XapigfZWeX+Wa0/SPEQnuysXiINIyq7SZLU3G5SRXCIvZjoHEDSd4gmZdJkmSRulb8lzNJdIh0mf8Lb0mThEPC1Olu/g/+Tx8iua8WK5tEqaKY2T75Yf5mDpffkszhGTpLO5T8yV/9Zou5gidtB+IQZIh+g6QzpUeU6axlSI9/hCuZEFynr5XPVTUdXJojBVfYZX1cv4RCwWb4tO6VppkHq9/KLyonb7hk4eVrcVl2mHM5LhymB5qLKDOopfa/vKW7FRWq8+wI/Uc5Wl0gvqntI+ykx5vTJfXam5kC919ZbeVPfS/l5MUKdKncQysVR6VTwodRNjxcfiE3GddChrNQeKn0jD5Y2a7tIasRM96SbeF/dohkqXS/vKSR4mLed0eYpmlDxGni+/JJaII4IBzOd0eYM4Ue2n7E4PzpOOV8apU+U/KsvEEmVHVosNhPRfykHic3E/76qHK9eIU6VnlLP5TDpS+4Q0R9lP7lAXZ02WPpBGspt2IR+rP2au1F19UixWt+crLuG38gvSVLFK3Zk16itMYJo6Tj1WHqLOoI96IWfyqTSXu6SXWS2uTcqZuvbVDlOHSAuZLZYxVtlTmsomzXS1uzxS2lX5lvYmthOLWCB1k06R+kuHcba2J7coywjpUSYpQ6Uz1L7qOvU3IR+gPUTaTX1IHsJf2UY7SH5Ee7k0QDtBeln5XDpQzeoIuZ/2delXyr7ierW31EPpIT+s7ZDXajvkS5UBYrTyGR+Kd6Snk9ggvyZ2kQeqF6ifaZ+VRut6T3pfnSvvrA5mC+VfuUTTTxws/4T18k+l5eIi9pAW84hYp72aB7X7CMrvpF2kHeVZbMxivNrBIO0Z6vvycsaod7JB/gc3KfPVOfJAdXt5Ksexl1rFR+IN5QaxSd1Wupc+Yh3TpJ3Ul9Tvyaeox9GhLJKnJ+UmMVbppJM3tO9yEltLY5X9xJFisRiqnCIPUx5X7lFXig71RV0/UH+m/l3qUGaznkZdzijRS'
        'su1R6i3Sb2ZpcwM+XzlWXE7PZReTGCk/Jx6JR9L8+ij7qmdK70i9dEu5XaxnPUqTmWW1Cme106Vr1VmK39mjrhO/VpuWKd8qQ7PYm9ppjJMbcTOmifFcawQL3Ka9gWpU/tjebCYQZVu4yKJ8hJHq+9ym/Rb8U/KDaxWnpXule6QprFEmqXdX95RvT9zkLpBGqBulhaoI5WjxGB2Uo8RH0j/UPaSZ2kfkW5UB/GoukodKf1CXqFMlvvqOkPaTV0szxTvqN9Rz1POkf4otlI/UR9qxHvSAPUCaa26Vh3ADOaJL5VL5VvUVWJ/5TL5LmWhtJf6ovLvTFLf1TVY/oPyurxK3Ul8LfZWBivHiK9EH/Ux5Q7+W/5+lkbQqDer3XhZzNQ8xpZijTpNadggtlYmin7S3tqhYrh0NpdK94np0jfqZKWLbXlQHSvtI8ZJq8XVyjL+wAfKdqHZStmsLJWOVjaJk+QnlEXqSOku7Z/FoWKiSOpUcZ+6mSnSFdpDxWhlvdRfHc5K8aG4iO21R4mBjBXviKp9TVOUG7IyXj1dHsM8rpCOUjql98Uc7Qmig3nSfG5WjpA3Ko+LQQp1trSZ9WK8fDkvcSq7Sr2V0zXPqP8r7af9WmyUZmq/SuwsBmsXiCvktWovaYn6K3GetI08Vx6h/UQ5Q8xXxqlvK3+SNspdDONO9UNliKjqImW5rnmaq/gb65T/oL+4Wukjtk7SRDGRhepZyo08KQaLL7XfUQbrel2bNOOZKK8WszRna0YpSdtTfU8cSBab1OOlSfK/yP21P2Upo8VKcYN4nhPVEaHp0H4uX6D2VKvoqb1P3lb5VCxSnpdPVhaKi9go5qoTxIF0KoO4VRypvM8KsZ6HxVJldzFbPKa00mTxkLa7vEZ9OWkvV+/XdQit+j/aL8Sl2r9Ityg9xBTlWbGLOEzNylqpB1+p5/KGepM6iJflq8R08YDyQ+mX6oPSdpodxEnqwZptGSNt9/+pIKYt7f3+lgAAAABJRU5ErkJggg=='
    )
    image_data = base64.b64decode(png_bs64_string)
    with open(path, 'wb') as f:
        f.write(image_data)

def get_aes_key(res):
    """Derive the key text from a key-endpoint response and the embedded PNG.

    Stores ``res`` in the module-level ``response`` (read by ``i``), writes the
    embedded image to ``1.png`` through ``save2png``, converts every pixel to
    an integer with ``a`` and passes the resulting table to ``e`` together
    with the base64-decoded ``res['data']``.

    Args:
        res: Mapping with at least the keys ``'t'`` and ``'data'``.

    Returns:
        Whatever ``e`` returns for the decoded payload and the pixel table.
    """
    global response
    response = res
    save2png()
    try:
        with Image.open("1.png") as img:
            img_array = img.load()
            width, height = img.size
            # data[x][y] is the packed colour of pixel (x, y).
            data = [
                [a(img_array[x, y]) for y in range(height)]
                for x in range(width)
            ]
    finally:
        # Remove the temporary image without shelling out to ``rm``; a file
        # that is already gone is not an error.
        try:
            os.remove('1.png')
        except OSError:
            pass
    return e(base64.b64decode(response['data']), data)



download_ichunqiu.py

#-*- coding:utf-8 -*- 
import requests
from requests.packages import urllib3
from Crypto.Cipher import AES
import re
from bs4 import BeautifulSoup
from get_aes_key import get_aes_key
import os,sys
import time
import cv2
#Turn off urllib3 warnings; the requests in this script are all sent with verify=False
urllib3.disable_warnings()
#One URL per lesson video, listed by hand. For an ordinary course the list could
#probably be collected automatically as in the original script; that was not
#tested here because no other course was available.
#Every lesson URL shares this prefix; only the trailing lesson id differs.
_LESSON_URL_PREFIX = "https://www.ichunqiu.com/train/course/detail/293/"
_LESSON_IDS = (
    66536, 66537, 66539, 66540, 66542, 66544, 66546, 66547, 66554, 66556,
    66561, 66627, 66628, 66629, 66630, 66632, 66633, 66635, 66640, 66642,
    66644, 66646, 66637, 66647, 66674, 66676, 66679, 66681, 66682, 66684,
    66686, 66698, 66699, 66700, 66702, 66704, 66705, 66710, 66711, 66712,
    66713, 66714, 66715, 66716, 66718, 66719, 66720, 66721, 66722, 66723,
    66724, 66725, 66726, 66728, 66729, 66731, 66732, 66733, 66734, 66735,
    66736, 68288, 68289, 68290, 68291, 66902, 66903, 66904, 66905, 66906,
    66907, 66908, 66909, 66910, 66911, 66912, 66913, 66914, 66915, 66916,
    66917, 66918, 66919, 66920, 66921, 66922, 66923, 66924, 66925, 66928,
    66929, 66930, 66931, 66932, 66933, 66935, 66936, 66938, 66939, 66940,
    66942, 66943, 66944, 66945, 66946, 66947, 66948, 66949, 66950, 66951,
    66952, 66953, 66954, 66955, 66956, 67121, 67122, 67123, 67124, 67125,
    67126, 67127, 67128, 67129, 67130, 67131, 67132, 67133, 67134, 67135,
    67136, 67137, 67138, 67139, 67140, 68465,
)
urls = [_LESSON_URL_PREFIX + str(lesson_id) for lesson_id in _LESSON_IDS]
#Global: HTTP headers passed to every request made through the session below
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'
}

#Global: the requests session shared by all functions in this script
s = requests.Session()

def fill_character(value):
    """Force ``value`` to exactly 16 items.

    Args:
        value: Text to normalise.

    Returns:
        ``value`` right-padded with ``'\\000'`` when shorter than 16, cut to
        its first 16 items when longer, and unchanged when already 16 long.
    """
    size = len(value)
    if size > 16:
        return value[:16]
    if size < 16:
        return value.ljust(16, '\000')
    return value

def get_key(video_id):
    """Fetch the key material for a video and derive its AES key.

    Args:
        video_id: Identifier substituted into the key endpoint URL.

    Returns:
        The result of ``get_aes_key`` applied to the JSON body of the
        response, or ``''`` when the response status is not 200.
    """
    reply = s.get(
        'https://www.ichunqiu.com/video/key/%s' % video_id,
        headers=headers,
        verify=False,
    )
    if reply.status_code != 200:
        return ''
    # The key is generated from the values in the JSON body.
    return get_aes_key(reply.json())

def decrypt_single_ts(ts,iv_str,key_str):
    """Decrypt one TS segment with AES in CBC mode.

    Args:
        ts: Encrypted segment bytes.
        iv_str: Initialisation vector as a hexadecimal string.
        key_str: AES key as a hexadecimal string.

    Returns:
        bytes: The decrypted data, the same length as ``ts``.

    Raises:
        ValueError: If ``iv_str`` or ``key_str`` is not valid hex, or the
            cipher rejects the key or IV.
    """
    ivs = bytes.fromhex(iv_str)
    key = bytes.fromhex(key_str)
    # Number of bytes missing from the last block; 0 when already aligned.
    pad_len = -len(ts) % AES.block_size
    if pad_len:
        # CBC only accepts whole blocks, so extend the input with zero bytes.
        # The previous code cut ``pad_len`` bytes off before appending the
        # zeros, which left the length unchanged and still unaligned.
        ts = ts + bytes(pad_len)
    cipher = AES.new(key, AES.MODE_CBC, ivs)
    out_data = cipher.decrypt(ts)
    if pad_len:
        # Discard the bytes produced by the zero padding added above.
        out_data = out_data[:-pad_len]
    return out_data

def handle_m3u8_data(m3u8_url,title):
    """Download, decrypt and concatenate every TS segment of a playlist.

    Nothing is written when the playlist request does not return 200.
    Segments whose request does not return 200 are skipped.

    Args:
        m3u8_url: URL of the m3u8 playlist; segment URLs are resolved against
            its directory.
        title: Path of the output file.

    Raises:
        IndexError: If the playlist has no ``#EXT-X-KEY`` line of the
            expected form.
    """
    res = s.get(m3u8_url,headers=headers,verify=False)
    if res.status_code != 200:
        return
    # The playlist is a text file.
    data = res.text.strip()
    # Pull the cipher method, the video id and the IV out of the key line.
    _aes_method,video_id,iv_str = re.findall(r'#EXT-X-KEY:METHOD=(.*?),URI="http:\/\/www.ichunqiu.com\/videokey\?vid=(\d+)",IV=0x(.*?)\n',data)[0]
    # Segment file names, in playlist order.
    ts_uri_list = re.findall(r'(\d+.ts)\n',data)
    key_str = get_key(video_id)
    print("KEY:",key_str)
    print("IV:",iv_str)
    # Directory part of the playlist URL; the same for every segment.
    url_base = m3u8_url[:m3u8_url.rfind('/')+1]
    chunks = []
    for ts_url in ts_uri_list:
        res1 = s.get(url_base+ts_url,headers=headers,verify=False)
        if res1.status_code == 200:
            chunks.append(decrypt_single_ts(res1.content,iv_str,key_str))
    # The context manager closes the file even if the write fails.
    with open('%s' % title,'wb') as out_file:
        out_file.write(b''.join(chunks))
                
def spider(start_index=80, output_dir='/home/ubuntu/ichunqiu'):
    """Log in, then download every lesson in ``urls`` from ``start_index`` on.

    Each lesson is saved as ``<output_dir>/<n>.mp4`` where ``n`` is its
    1-based position in ``urls``. After each download the file is opened with
    OpenCV; if that fails, or any step raises, the lesson is retried after a
    10 second pause until it succeeds. Nothing is downloaded when the login
    request fails or the reply's ``code`` is not 0.

    Args:
        start_index: 1-based position of the first lesson to download;
            earlier lessons are skipped.
        output_dir: Directory that receives the ``.mp4`` files; created if
            missing.
    """
    # Login endpoint.
    url = 'https://user.ichunqiu.com/login/normal'
    # Login form; 'account', 'password' and 'captcha' are blank here and have
    # to be filled in before running.
    data = {
        'redirect_url':"https://www.ichunqiu.com/",
        'appid':'5af018bda55004e1',
        'account':'',
        'password':'',
        'captcha':'',
        'mt':'1577261775197',
        'rs':'e8c48853ab79882bc54ef7f6b5452c98'
    }
    res = s.post(url=url,data=data,headers=headers,verify=False)
    if res.status_code != 200 or res.json()['code'] != 0:
        return

    # Replaces the former shell ``touch``, which only created an unrelated
    # empty file in the current directory.
    os.makedirs(output_dir, exist_ok=True)

    for i, keshi_url in enumerate(urls, start=1):
        if i < start_index:
            continue
        filename = os.path.join(output_dir, '%s.mp4' % i)
        while True:
            try:
                res2 = s.get(keshi_url,headers=headers,verify=False)
                if res2.status_code == 200:
                    m3u8_url1 = re.findall(r'data-video-url="(https.*?m3u8)',res2.text)[0]
                    index = m3u8_url1.rfind('/')
                    # Insert the '720/' path segment before the playlist name.
                    m3u8_url = m3u8_url1[:index+1]+'720/'+m3u8_url1[index+1:]
                    print(m3u8_url)

                    handle_m3u8_data(m3u8_url,filename)
                    print('下载完毕')
                    # Validate the result: a file OpenCV cannot open counts
                    # as corrupt.
                    vid = cv2.VideoCapture(filename)
                    try:
                        playable = vid.isOpened()
                    finally:
                        vid.release()
                    if playable:
                        break
                    print('文件损坏')
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that Ctrl-C
                # can still stop the retry loop.
                print(i,"文件损坏、密钥损坏")
            # Pause before every retry, including after a non-200 lesson
            # page, which previously retried immediately in a tight loop.
            time.sleep(10)
                    

                            
            
#Start the crawler only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    spider()



放在服务器上挂了一天,终于爬完了,真不错。

image.png

分享给朋友:

相关文章

栈迁移基础2年前 (2019-09-15)
[WP]36D杯1年前 (2020-05-02)
网络安全法2年前 (2019-09-14)
ptmalloc堆内存管理2年前 (2020-01-31)

发表评论

访客

◎欢迎参与讨论,请在这里发表您的看法和观点。