A while ago, following a tutorial, I put together a simple crawler that scrapes epidemic data from NetEase's real-time COVID-19 tracker page and saves it as CSV tables. The code is below:

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 30 20:57:01 2020

@author: hecheng
"""


import requests
import pandas as pd
import time
import json
import os
pd.set_option('display.max_rows', 500)  # show up to 500 rows when printing DataFrames

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.69 Safari/537.36 Edg/81.0.416.34'
}
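
# The list-total endpoint returns everything in one JSON document: the national
# day-by-day records (chinaDayList) and the country/province tree (areaTree) all
# sit under its 'data' key.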

url = 'https://c.m.163.com/ug/api/wuhan/app/data/list-total'
r = requests.get(url, headers=headers)
print('Request status:', r.status_code)

data_json = json.loads(r.text)
data_json.keys()
data = data_json['data']
data_province = data['areaTree'][2]['children']  # provinces of China (assumes China is the third node in areaTree)
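
# get_data flattens each record: it keeps the columns listed in info_list and expands
# the nested 'today' and 'total' dicts into today_* / total_* columns.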

def get_data(data, info_list):
    info = pd.DataFrame(data)[info_list]  # basic info columns
    
    today_data = pd.DataFrame([i['today'] for i in data])  # build the 'today' data
    today_data.columns = ['today_' + i for i in today_data.columns]  # rename columns
    
    total_data = pd.DataFrame([i['total'] for i in data])  # build the 'total' data
    total_data.columns = ['total_' + i for i in total_data.columns]  # rename columns
    
    return pd.concat([info, total_data, today_data], axis=1)  # merge info, total and today horizontally into one table

today_province = get_data(data_province,['id','lastUpdateTime','name'])
today_province.head()

def save_data(data, name):  # helper: save a DataFrame as a date-stamped CSV
    os.makedirs('./results', exist_ok=True)  # make sure the output folder exists
    file_name = './results/' + name + '_' + time.strftime('%Y_%m_%d', time.localtime(time.time())) + '.csv'
    data.to_csv(file_name, index=None, encoding='utf_8_sig')
    print(file_name + ' saved successfully!')

save_data(today_province,'today_province')
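
# areaTree holds one node per country, so the same get_data helper also yields a
# per-country snapshot.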

areaTree = data['areaTree']
today_world = get_data(areaTree,['id','lastUpdateTime','name'])
today_world.head()

save_data(today_world,'today_world')

chinaDayList = data['chinaDayList'] # extract chinaDayList (China's day-by-day records)
alltime_China = get_data(chinaDayList,['date','lastUpdateTime'])
alltime_China.head()

save_data(alltime_China,'alltime_China')
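
# Map each province id to its name, then hit the list-by-area-code endpoint once per
# province to collect its full day-by-day history.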

province_dict = {num:name for num,name in zip(today_province['id'],today_province['name'])}

start = time.time()
for province_id in province_dict: # loop over each province id
    
    try:
        # request this province's data by its area code and parse the JSON
        url = 'https://c.m.163.com/ug/api/wuhan/app/data/list-by-area-code?areaCode='+province_id
        r = requests.get(url, headers=headers)
        data_json = json.loads(r.text)
        
        # extract the province's records, then add the province name
        province_data = get_data(data_json['data']['list'],['date'])
        province_data['name'] = province_dict[province_id]
        
        # merge: the first province initializes the DataFrame, later ones are appended
        if province_id == '420000': # '420000' (Hubei) is assumed to be the first id in province_dict
            alltime_province = province_data
        else:
            alltime_province = pd.concat([alltime_province,province_data])
            
        print('-'*20, province_dict[province_id], 'done',
              province_data.shape, alltime_province.shape,
              ', elapsed:', round(time.time()-start), '-'*20)
        
        # wait a moment between requests
        time.sleep(1)
        
    except Exception:
        print('-'*20, province_dict[province_id], 'failed', '-'*20)

save_data(alltime_province,'alltime_province')
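
# Same pattern for the countries: one list-by-area-code request per country id,
# stacked into a single worldwide time series.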

country_dict = {key:value for key,value in zip(today_world['id'], today_world['name'])}

start = time.time()
for country_id in country_dict: # loop over each country id
    
    try:
        # request this country's data by its area code and parse the JSON
        url = 'https://c.m.163.com/ug/api/wuhan/app/data/list-by-area-code?areaCode='+country_id
        r = requests.get(url, headers=headers)
        json_data = json.loads(r.text)
        
        # build this country's DataFrame
        country_data = get_data(json_data['data']['list'],['date'])
        country_data['name'] = country_dict[country_id]

        # stack: the first country initializes the DataFrame, later ones are appended
        if country_id == '9577772': # '9577772' is assumed to be the first id in country_dict
            alltime_world = country_data
        else:
            alltime_world = pd.concat([alltime_world,country_data])
            
        print('-'*20, country_dict[country_id], 'done', country_data.shape, alltime_world.shape,
              ', elapsed:', round(time.time()-start), '-'*20)
        
        time.sleep(1)

    except Exception:
        print('-'*20, country_dict[country_id], 'failed', '-'*20)

save_data(alltime_world,'alltime_world')

print('Scraping finished, thank you!')
time.sleep(60)  # pause before exiting (presumably to keep the console window open for a moment)
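
To sanity-check a run, the saved CSVs can be read back with pandas. A minimal sketch, assuming the crawler was run on 2020-03-30 (the date suffix in the file name is whatever save_data printed for your run):

import pandas as pd

df = pd.read_csv('./results/alltime_world_2020_03_30.csv')  # hypothetical path, adjust the date suffix
print(df.shape)                        # one row per country per day
print(df['name'].nunique(), 'countries in the file')
print(df.columns.tolist())             # date, name, plus the total_* / today_* columns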