Python 知识星球文件下载

发布时间:2019-09-19 08:04:20 | 编辑:auto | 阅读(1747)

    #!/usr/bin/python3
    # -*- coding: UTF-8 -*-
    
    import requests
    import json
    from urllib.parse import quote
    import os
    from pyquery import PyQuery as pq
    import datetime
    
    # Default HTTP headers sent with the API requests made by this script.
    # The Authorization value can be overridden through the ZSXQ_AUTHORIZATION
    # environment variable so a personal token does not have to be edited into
    # the source; the literal below is only the fallback.
    headers = {
        'Authorization': os.environ.get(
            'ZSXQ_AUTHORIZATION', '37923FBC-C87D-454C-902D-A81DB0834605'),
        'x-request-id': "73e67a6f-cf88-4c10-26da-a30441464ed5",
        'accept': "application/json, text/plain, */*",
        'host': "api.zsxq.com",
        'connection': "keep-alive",
        'referer': "https://wx.zsxq.com/dweb/",
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    }
    
    def readtopicurl(filename):
        """Return the URL stored in *filename*.

        Surrounding whitespace (for example a trailing newline left by a text
        editor) is removed so the value can be used directly as a request URL.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        with open(filename, 'r', encoding='utf-8') as f:
            return f.read().strip()
    
    def writetopicurl(filename,url):
        """Overwrite *filename* with *url*.

        Returns:
            True when the write succeeded, False when the file could not be
            written or *url* is not writable text.
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(url)
        except (OSError, TypeError, ValueError):
            # Keep the best-effort contract (report failure as False) without
            # also swallowing KeyboardInterrupt/SystemExit like a bare except.
            return False
        return True
    
    def getDownloadURL(file_id):
        """Build the API address that is queried for a file's download link."""
        return f'https://api.zsxq.com/v1.10/files/{file_id}/download_url'
    
    def get_topic_list(topics_url, end_time=None):
        """Return *topics_url*, optionally extended with an ``end_time`` parameter.

        Args:
            topics_url: Base listing URL.
            end_time: Timestamp string; it is percent-encoded before being
                appended. When omitted or empty the base URL is returned as is.

        Returns:
            The URL to request.
        """
        if not end_time:
            # Previously this path fell through to an unbound local variable.
            return topics_url
        # Use '?' when the base URL carries no query string yet.
        separator = '&' if '?' in topics_url else '?'
        return topics_url + separator + 'end_time=' + quote(end_time)
    
    def get_endtime(create_time):
        """Return *create_time* moved back by one millisecond.

        The input is sliced positionally: characters 0-19 are the
        ``YYYY-mm-ddTHH:MM:SS.`` prefix, 20-22 the milliseconds and everything
        from index 23 on is kept unchanged as the suffix (e.g. ``+0800``).

        Returns:
            The adjusted timestamp string, or None (after printing "error")
            when the input cannot be parsed.
        """
        try:
            prefix = create_time[:20]
            millis = create_time[20:23]
            suffix = create_time[23:]
            if millis == '000':
                # Subtracting from .000 must borrow from the seconds field,
                # otherwise the millisecond part would become -1.
                temp_time = datetime.datetime.strptime(create_time[:19], "%Y-%m-%dT%H:%M:%S")
                temp_time -= datetime.timedelta(seconds=1)
                end_time = temp_time.strftime("%Y-%m-%dT%H:%M:%S") + '.999' + suffix
                print('end_time:{0}'.format(end_time))
                return end_time
            # zfill restores the leading zeros lost by the int round-trip.
            return prefix + str(int(millis) - 1).zfill(3) + suffix
        except (IndexError, TypeError, ValueError):
            # int()/strptime raise ValueError and slicing None raises TypeError;
            # IndexError alone never covered the real failure modes.
            print("error")
            return None
    
    def request_topics_url(topics_url,headers):
        """GET *topics_url* with *headers* and decode the JSON response body.

        Returns the decoded object when the status code is 200, otherwise None.
        """
        response = requests.get(topics_url, headers=headers)
        if response.status_code != 200:
            return None
        return json.loads(response.text)
    
    def download_file(index, url, file_name,filedir):
        """Download *url* into ``<cwd>/<filedir>/<file_name>`` unless it already exists.

        Args:
            index: Zero-based position of the file in the current page; only
                used for the progress message.
            url: Address to fetch.
            file_name: Name of the file to create inside *filedir*.
            filedir: Target directory, created when missing.

        Returns:
            True when the file already exists or was written, False when the
            server answered with a status other than 200.
        """
        # exist_ok avoids a race with a concurrent creator and also handles
        # nested target directories.
        os.makedirs(filedir, exist_ok=True)
        # os.path.join uses the platform separator; the former hard-coded
        # backslashes only produced a usable path on Windows.
        file_fullpath = os.path.join(os.getcwd(), filedir, file_name)
        if os.path.exists(file_fullpath):
            return True
        file_res = requests.get(url)
        if file_res.status_code != 200:
            return False
        with open(file_fullpath, 'wb') as f:
            f.write(file_res.content)
        print('----第 {0}个文件:{1}下载成功!'.format(index+1,file_name))
        return True
    
    def download_file_url(url):
        """Request *url* with the module-level ``headers`` and return the download link.

        The response body is decoded as JSON and the value stored under
        ``resp_data -> download_url`` is returned.

        Raises:
            KeyError: if the decoded response lacks those keys.
            ValueError: if the body is not valid JSON.
        """
        response = requests.get(url, headers=headers)
        # Decode the JSON text directly. Passing it through an HTML parser
        # first can rewrite characters such as '&' or '<' inside the payload.
        res_data = json.loads(response.text)
        return res_data['resp_data']['download_url']
    
    if __name__ =="__main__":
        # Number of entries requested per page; must match the count= value
        # in the listing URL below.
        page_size = 20
        # Example target: the file list of the "Lao Qi reading circle" group.
        init_topics_url = 'https://api.zsxq.com/v1.10/groups/454548818428/files?count=20'
        # File that remembers the last listing URL so a later run can resume.
        urlfile = 'temp_topics_url.txt'
        filedir = '读书圈文件'
        if not os.path.exists(urlfile):
            writetopicurl(urlfile, init_topics_url)
            topic_urls = init_topics_url
        else:
            topic_urls = readtopicurl(urlfile)

        print('file_urls:{0}'.format(topic_urls))

        while True:
            resp_data = request_topics_url(topic_urls,headers)
            if resp_data is None:
                # Non-200 answer: stop instead of failing on None['resp_data'].
                print('request failed:{0}'.format(topic_urls))
                break
            filelist = resp_data['resp_data']['files']
            for index , urlinfo in enumerate(filelist):
                file_id = urlinfo['file']['file_id']
                file_name = urlinfo['file']['name']
                downloadurl = getDownloadURL(file_id)
                file_url = download_file_url(downloadurl)
                download_file(index,file_url, file_name,filedir)
            if len(filelist) < page_size:
                # A short page means there is nothing older left to fetch.
                print('全部文件下载完成!!!')
                break
            # Full page: continue with entries created just before the last one.
            end_time = get_endtime(filelist[-1]['file']['create_time'])
            if end_time is None:
                # Without a usable timestamp the next page cannot be addressed.
                break
            topic_urls = get_topic_list(init_topics_url,end_time)
            writetopicurl(urlfile, topic_urls)
            print('topic_urls:{0}'.format(topic_urls))
    

关键字