# -*- coding: utf-8 -*-
"""Per-user live-stream watch detail statistics.

Scheduled task, run every 2 hours.

Shipped as a first version: if visitor clues are to be used, the logic here
needs to be adjusted, including (dws_user_share_event, dws_user_visitor_clues).
"""
import logging
import os
import sys

from common.mysql_uitl import fetch_all, save_result, insert_batch, insert, save_etl_log
from common.time_util import now, now_str, YMDHMS_FORMAT

logging.basicConfig(format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

# Path the script was invoked with; passed to save_result.
file_name = sys.argv[0]
# This file's base name without extension; recorded in the ETL log.
task_file = os.path.split(__file__)[-1].split(".")[0]


def dws_user_watch_live(data_dt):
    """Build and store the merged watch/share rows for one ``data_dt``.

    Queries the share rows and the watch rows for ``data_dt``, merges them
    with ``merge_live_dict``, hands the merged rows to ``save_result`` for
    ``tamp_data_dws.dws_user_watch_live`` and then records a ``'done'`` entry
    through ``save_etl_log``.

    :param data_dt: partition date, bound to the ``%s`` placeholder of both
        queries (the ``__main__`` driver passes a ``datetime.date``).
    """
    share_live_dict = query_dws_user_share_live_num(data_dt)
    watch_live_dict = query_dws_user_watch_live(data_dt)
    merge_live_result_dict = merge_live_dict(watch_live_dict, share_live_dict)
    row = save_result('tamp_data_dws', 'dws_user_watch_live', merge_live_result_dict, file_name)
    now_time = now_str(YMDHMS_FORMAT)
    save_etl_log('tamp_data_dws', 'dws_user_watch_live', data_dt, row, 'done', task_file, now_time)


# TODO: add task dependency relationships; they also need to be configured
# automatically.


# Shared live streams
def query_dws_user_share_live_num(data_dt):
    """Fetch one "not watched" row per (user, live stream) shared on ``data_dt``.

    The rows carry the same columns as the watch query plus ``share_num``,
    with ``learn_dur``/``play_rate`` fixed to zero and ``event_type`` 1000.

    :param data_dt: partition date bound to the query's ``%s`` placeholder.
    :return: whatever ``fetch_all`` returns for the query (presumably a
        sequence of dict rows; confirm against ``common.mysql_uitl``).
    """
    function_name = sys._getframe().f_code.co_name
    logging.info(f'{function_name} start')
    sql = '''
        select p.data_dt
               ,p.source_user_id as user_id
               ,p.res_id
               ,t.zt_name as res_name
               ,t.real_duration_second as res_dur
               # ,coalesce(t.real_duration_second, 0) as res_dur
               ,0 as learn_dur
               ,0.0 as play_rate
               ,1000 as event_type     -- 未观看
               ,'未观看' as watch_type
               ,t.room_type
               ,if(t.room_type = 1, '母直播间', '子直播间') as live_type
               ,t.zt_starttime as live_start
               ,t.zt_endtime as live_end
               ,min(p.local_time) as start_time
               ,max(p.local_time) as end_time
               -- 为了避免,因为分享多次,就是多条数据
               ,count(p.res_id) as share_num
        from tamp_data_dwd.dwd_user_share_event p
        left join tamp_zhibo.zhibo_theme t on p.res_id = t.id
        where p.data_dt = %s
        and p.event_type = '1008'
        group by p.source_user_id, p.res_id
    '''
    share_live_dict = fetch_all(sql, data_dt)
    logging.info(f'{function_name} success')
    return share_live_dict


def query_dws_user_watch_live(data_dt):
    """Fetch the watch-detail rows of ``dwd_user_watch_live`` for ``data_dt``.

    ``res_dur`` and ``play_rate`` are defaulted to 0 when NULL.

    :param data_dt: partition date bound to the query's ``%s`` placeholder.
    :return: whatever ``fetch_all`` returns for the query (presumably a
        sequence of dict rows; confirm against ``common.mysql_uitl``).
    """
    function_name = sys._getframe().f_code.co_name
    logging.info(f'{function_name} start')
    sql = '''
        select data_dt
               ,user_id
               ,res_id
               ,res_name
               ,coalesce(res_dur,0) as res_dur
               ,learn_dur
               ,coalesce(play_rate, 0) as play_rate
               ,event_type
               ,watch_type
               ,room_type
               ,live_type
               ,live_start
               ,live_end
               ,start_time
               ,end_time
        from tamp_data_dwd.dwd_user_watch_live
        where data_dt = %s
    '''
    watch_live_dict = fetch_all(sql, data_dt)
    logging.info(f'{function_name} success')
    return watch_live_dict


def merge_live_dict(watch_live_dict, share_live_dict):
    """Merge watch rows and share rows into one list of rows with ``share_num``.

    * Every watch row is kept and gets ``share_num`` from the share row with
      the same ``(user_id, res_id)``, or 0 when the user did not share it.
    * Share rows whose ``(user_id, res_id)`` has no watch row are appended
      unchanged (they already describe a "not watched" record).
    * Exact duplicate rows are removed, keeping first occurrence order.

    The input rows are not modified; watch rows are copied before
    ``share_num`` is added.

    :param watch_live_dict: iterable of dict rows with at least ``user_id``
        and ``res_id``; may be empty or ``None``.
    :param share_live_dict: iterable of dict rows with at least ``user_id``,
        ``res_id`` and ``share_num``; may be empty or ``None``.
    :return: list of merged dict rows.
    """
    function_name = sys._getframe().f_code.co_name
    logging.info(f'{function_name} start')

    watch_rows = watch_live_dict or []
    share_rows = share_live_dict or []

    # Index share counts by (user, live stream) so each watch row is matched
    # with a single lookup instead of being compared against every share row.
    share_num_by_key = {
        (share_row['user_id'], share_row['res_id']): share_row['share_num']
        for share_row in share_rows
    }

    merge_live_tmp = []
    watched_keys = set()

    # User watched the live stream (and possibly shared it as well).
    for watch_row in watch_rows:
        key = (watch_row['user_id'], watch_row['res_id'])
        watched_keys.add(key)
        merged_row = dict(watch_row)
        merged_row['share_num'] = share_num_by_key.get(key, 0)
        merge_live_tmp.append(merged_row)

    # User shared the live stream without watching it.
    for share_row in share_rows:
        if (share_row['user_id'], share_row['res_id']) not in watched_keys:
            merge_live_tmp.append(share_row)

    # De-duplicate. Rows are dicts (unhashable), so compare by equality.
    merge_live_result = []
    for merged_row in merge_live_tmp:
        if merged_row not in merge_live_result:
            merge_live_result.append(merged_row)

    logging.info(f'{function_name} success')
    return merge_live_result


if __name__ == '__main__':
    import datetime

    # NOTE(review): fixed back-fill window; every date in [begin, end] is
    # processed, one day at a time.
    begin = datetime.date(2021, 9, 21)
    end = datetime.date(2021, 9, 22)
    data_dt = begin
    delta = datetime.timedelta(days=1)
    while data_dt <= end:
        print(data_dt.strftime("%Y-%m-%d"))
        dws_user_watch_live(data_dt)
        data_dt += delta