A while back, following a tutorial, I built a simple crawler that scrapes COVID-19 data from NetEase's real-time outbreak page and saves it as CSV tables. The code is as follows:
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 30 20:57:01 2020
@author: hecheng
"""
import os
import requests
import pandas as pd
import time
import json

pd.set_option('display.max_rows', 500)  # show up to 500 rows when printing DataFrames
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.69 Safari/537.36 Edg/81.0.416.34'
}
url = 'https://c.m.163.com/ug/api/wuhan/app/data/list-total'  # summary endpoint: overall, per-province and per-country data
r = requests.get(url, headers=headers)
print('Response status:', r.status_code)
data_json = json.loads(r.text)
print(data_json.keys())  # quick look at the top-level keys of the response
data = data_json['data']
data_province = data['areaTree'][2]['children']  # at the time of writing, areaTree[2] was China; its children are the provinces
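# Rough shape of the JSON this script relies on (inferred from the fields used below,
# not an official schema):
#   data['chinaDayList'] : per-day records for China, each with 'date', 'lastUpdateTime',
#                          plus 'today' and 'total' sub-dicts
#   data['areaTree']     : one record per country/region, each with 'id', 'name', 'lastUpdateTime',
#                          'today', 'total' and, for China, a 'children' list of province records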
def get_data(data, info_list):
    info = pd.DataFrame(data)[info_list]  # keep only the requested basic columns
    today_data = pd.DataFrame([i['today'] for i in data])  # expand the 'today' sub-dicts
    today_data.columns = ['today_' + i for i in today_data.columns]  # prefix the column names
    total_data = pd.DataFrame([i['total'] for i in data])  # expand the 'total' sub-dicts
    total_data.columns = ['total_' + i for i in total_data.columns]  # prefix the column names
    return pd.concat([info, total_data, today_data], axis=1)  # merge info, total and today side by side
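# A quick, self-contained illustration of what get_data produces, using two mocked
# records (the field names are only examples, not the real API schema):
sample = [
    {'id': '1', 'name': 'A', 'lastUpdateTime': '2020-03-30',
     'today': {'confirm': 5}, 'total': {'confirm': 100}},
    {'id': '2', 'name': 'B', 'lastUpdateTime': '2020-03-30',
     'today': {'confirm': 3}, 'total': {'confirm': 80}},
]
print(get_data(sample, ['id', 'lastUpdateTime', 'name']))
# columns: id, lastUpdateTime, name, total_confirm, today_confirm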
today_province = get_data(data_province, ['id', 'lastUpdateTime', 'name'])  # current snapshot for each province
print(today_province.head())
def save_data(data, name):  # save a DataFrame as CSV under ./results, with today's date in the file name
    os.makedirs('./results', exist_ok=True)  # make sure the output directory exists
    file_name = './results/' + name + '_' + time.strftime('%Y_%m_%d', time.localtime(time.time())) + '.csv'
    data.to_csv(file_name, index=False, encoding='utf_8_sig')
    print(file_name + ' saved!')
save_data(today_province,'today_province')
areaTree = data['areaTree']  # top level of areaTree: one entry per country/region
today_world = get_data(areaTree, ['id', 'lastUpdateTime', 'name'])  # current snapshot for each country/region
print(today_world.head())
save_data(today_world,'today_world')
chinaDayList = data['chinaDayList']  # nationwide day-by-day history
alltime_China = get_data(chinaDayList, ['date', 'lastUpdateTime'])
print(alltime_China.head())
save_data(alltime_China,'alltime_China')
province_dict = {num: name for num, name in zip(today_province['id'], today_province['name'])}  # map province id -> name
start = time.time()
for province_id in province_dict:  # iterate over the province ids
    try:
        # request each province's own endpoint and parse the JSON
        url = 'https://c.m.163.com/ug/api/wuhan/app/data/list-by-area-code?areaCode=' + province_id
        r = requests.get(url, headers=headers)
        data_json = json.loads(r.text)
        # extract the province's history and tag it with the province name
        province_data = get_data(data_json['data']['list'], ['date'])
        province_data['name'] = province_dict[province_id]
        # accumulate: '420000' (Hubei) is assumed to be the first id in the dict, so it seeds the combined DataFrame
        if province_id == '420000':
            alltime_province = province_data
        else:
            alltime_province = pd.concat([alltime_province, province_data])
        print('-' * 20, province_dict[province_id], 'done',
              province_data.shape, alltime_province.shape,
              ', elapsed:', round(time.time() - start), '-' * 20)
        # be polite: wait a second between requests
        time.sleep(1)
    except Exception:
        print('-' * 20, province_dict[province_id], 'failed', '-' * 20)
save_data(alltime_province,'alltime_province')
country_dict = {key: value for key, value in zip(today_world['id'], today_world['name'])}  # map country id -> name
start = time.time()
for country_id in country_dict:  # iterate over the country ids
    try:
        # request each country's own endpoint and parse the JSON
        url = 'https://c.m.163.com/ug/api/wuhan/app/data/list-by-area-code?areaCode=' + country_id
        r = requests.get(url, headers=headers)
        json_data = json.loads(r.text)
        # extract the country's history and tag it with the country name
        country_data = get_data(json_data['data']['list'], ['date'])
        country_data['name'] = country_dict[country_id]
        # accumulate: '9577772' is assumed to be the first id in the dict, so it seeds the combined DataFrame
        if country_id == '9577772':
            alltime_world = country_data
        else:
            alltime_world = pd.concat([alltime_world, country_data])
        print('-' * 20, country_dict[country_id], 'done', country_data.shape, alltime_world.shape,
              ', elapsed:', round(time.time() - start), '-' * 20)
        # be polite: wait a second between requests
        time.sleep(1)
    except Exception:
        print('-' * 20, country_dict[country_id], 'failed', '-' * 20)
save_data(alltime_world,'alltime_world')
print('Scraping finished, thanks for using!')
time.sleep(60)  # keep the console window open for a minute before exiting
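To sanity-check the output, the saved CSVs can be read back with pandas. A minimal sketch (the file name below simply mirrors the naming convention used in save_data above):

import time
import pandas as pd

date_str = time.strftime('%Y_%m_%d', time.localtime(time.time()))
check = pd.read_csv('./results/today_province_' + date_str + '.csv')
print(check.shape)
print(check.head())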