# Copyright (C) 2019 Xiaotiancai Science & Technology Co. Ltd.
# All rights reserved.

# Mean Earth radius in metres, used by the haversine formula in dist().
EARTH_RADIUS_M = 6371000.0


# Parse MAC address to int
def parse_macaddr(s):
    """Parse a colon-separated hexadecimal MAC address into an int.

    Example: ``"00:00:00:00:01:ff"`` -> ``0x1ff``.

    Raises:
        ValueError: if the remaining characters are not valid hexadecimal.
    """
    return int(s.replace(':', ''), 16)


# Parse timestamp
def timestamp(s):
    """Convert a ``YYYY-mm-dd HH:MM:SS`` string to seconds since the epoch.

    The string is interpreted as local time, because ``time.mktime`` is used.
    """
    from time import mktime, strptime

    return mktime(strptime(s, '%Y-%m-%d %H:%M:%S'))


# Read input data from file, and parse it
def read_data(fd):
    """Read the CSV input and return a list of ``Data`` records.

    Args:
        fd: either a path (``str``), which is opened and read, or an
            already-open iterable of CSV lines (e.g. a file object).

    The CSV must have the columns ``watch_id`` (hexadecimal), ``latitude``,
    ``longitude``, ``create_time`` and ``params``.  ``params`` is a JSON
    object; rows whose object has no ``macs`` key, or an empty ``macs``
    string, are skipped.  ``macs`` is a ``|``-separated list of
    ``<mac>,<value>`` pairs, stored in ``wifi_snr`` as ``{mac_int: float}``.

    Returns:
        list of ``Data(watch_id, latitude, longitude, wifi_snr, timestamp)``
        named tuples, in file order.
    """
    import csv
    import json
    from collections import namedtuple

    if isinstance(fd, str):
        # newline='' lets the csv module do its own newline handling.
        with open(fd, newline='') as f:
            return read_data(f)

    Data = namedtuple('Data', ['watch_id', 'latitude', 'longitude',
                               'wifi_snr', 'timestamp'])

    data = []
    for row in csv.DictReader(fd):
        params = json.loads(row['params'])
        if 'macs' not in params:
            continue
        macs = params['macs']
        if macs == '':
            continue

        wifi_snr = {}
        for entry in macs.split('|'):
            fields = entry.split(',')
            wifi_snr[parse_macaddr(fields[0])] = float(fields[1])

        data.append(Data(int(row['watch_id'], 16),
                         float(row['latitude']),
                         float(row['longitude']),
                         wifi_snr,
                         timestamp(row['create_time'])))
    return data


def dist(p, q):
    """Return the great-circle distance between ``p`` and ``q`` in metres.

    Both arguments need ``latitude`` and ``longitude`` attributes expressed
    in degrees.  The distance is computed with the haversine formula on a
    sphere of radius ``EARTH_RADIUS_M``.
    """
    from math import asin, cos, radians, sin, sqrt

    lat0 = radians(p.latitude)
    lat1 = radians(q.latitude)
    dlat = abs(lat0 - lat1)
    dlng = abs(radians(p.longitude) - radians(q.longitude))

    def hav(x):
        half = sin(x / 2)
        return half * half

    h = hav(dlat) + cos(lat0) * cos(lat1) * hav(dlng)
    # Rounding can push h marginally above 1 for (near-)antipodal points,
    # which would make asin() raise; clamp it into the valid domain.
    h = min(1.0, h)
    return 2 * EARTH_RADIUS_M * asin(sqrt(h))


# A trivial function, like how we detect cheating in programming contests.
def best_match(known_points, new_point, similarity):
    """Return the known point most similar to ``new_point``.

    Args:
        known_points: iterable of points with a ``wifi_snr`` mapping.
        new_point: the point to match.
        similarity: callable ``similarity(known, new)`` returning a
            comparable score, or ``None`` when the pair cannot be scored.

    Known points with an empty ``wifi_snr`` are ignored.  On ties the first
    point encountered wins.

    Returns:
        ``(best_point, best_score)``; ``(None, None)`` if nothing was scored.
    """
    best = None
    best_sim = None
    for pt in known_points:
        if not pt.wifi_snr:
            continue
        sim = similarity(pt, new_point)
        if sim is None:
            continue
        if best_sim is None or sim > best_sim:
            best_sim = sim
            best = pt
    return best, best_sim


def get_wifi_coverage(pts):
    """Estimate, per MAC address, how far its sightings spread out.

    For every MAC seen in ``pts`` the arithmetic mean of the latitudes and
    longitudes of the points that saw it is taken as its centre; the
    coverage is the largest ``dist`` from that centre to any of those
    points.

    Returns:
        dict mapping MAC -> coverage, in whatever unit ``dist`` returns.
    """
    from collections import defaultdict, namedtuple

    Loc = namedtuple('Loc', ['latitude', 'longitude'])

    mac_loc = defaultdict(list)
    for pt in pts:
        loc = Loc(pt.latitude, pt.longitude)
        for mac in pt.wifi_snr:
            mac_loc[mac].append(loc)

    mac_cov = {}
    for mac, locs in mac_loc.items():
        sum_la = 0.0
        sum_lo = 0.0
        for loc in locs:
            sum_la += loc.latitude
            sum_lo += loc.longitude
        cent = Loc(sum_la / len(locs), sum_lo / len(locs))
        mac_cov[mac] = max(dist(cent, loc) for loc in locs)
    return mac_cov


def select_bad_wifi(pts, crit):
    """Return the set of MACs whose coverage in ``pts`` exceeds ``crit``."""
    return {mac for mac, cov in get_wifi_coverage(pts).items() if cov > crit}


def remove_bad_wifi(pts, bad_mac):
    """Strip every MAC in ``bad_mac`` from each point's ``wifi_snr``.

    The ``wifi_snr`` mappings are modified in place; ``pts`` itself is
    returned for convenience.
    """
    for pt in pts:
        # The intersection is a fresh set, so deleting while looping is safe.
        for mac in pt.wifi_snr.keys() & bad_mac:
            del pt.wifi_snr[mac]
    return pts


def remove_bad_wifi2(pts, bad_mac):
    """Return the points of ``pts`` that saw none of the MACs in ``bad_mac``.

    A new list is returned; ``pts`` is left untouched.
    """
    return [pt for pt in pts
            if not any(mac in bad_mac for mac in pt.wifi_snr)]


def _ratio(part, whole):
    """Return ``part / whole``, or 0.0 when ``whole`` is zero."""
    return part / whole if whole else 0.0


def toplev(infile, sim, sim_crit, bad_wifi_crit=500, stat_cnt=10,
           stat_delta=50, remove_bad_wifi_policy=1,
           skip_tests_with_bad_wifi=False):
    """Evaluate WiFi-fingerprint matching on ``infile`` and print statistics.

    For each watch, its points are ordered by timestamp; the earlier half is
    used as the known ("train") set and the later half as test points.  Each
    test point is matched against the train set with ``sim`` and the distance
    between the test point and its best match is bucketed.

    Args:
        infile: path or open file passed to ``read_data``.
        sim: similarity callable, see ``best_match``.
        sim_crit: matches scoring below this are discarded.
        bad_wifi_crit: coverage above which a MAC counts as "bad"
            (see ``select_bad_wifi``).
        stat_cnt: number of distance buckets to report.
        stat_delta: width of one distance bucket.
        remove_bad_wifi_policy: 0 = keep everything, 1 = drop bad MACs from
            train points, 2 = drop train points that saw a bad MAC.
        skip_tests_with_bad_wifi: if true, test points that saw a bad MAC
            are not matched (they still count towards the total).

    Raises:
        ValueError: if ``remove_bad_wifi_policy`` is not 0, 1 or 2.
    """
    from itertools import groupby
    from operator import attrgetter

    # Fail fast, before any I/O, instead of only when a group is processed.
    if remove_bad_wifi_policy not in (0, 1, 2):
        raise ValueError("unknown remove_bad_wifi_policy")

    data = read_data(infile)
    # groupby() needs its input sorted by the grouping key; the secondary
    # key gives the chronological order inside each watch.
    data.sort(key=attrgetter('watch_id', 'timestamp'))

    tot = 0
    matched = 0
    cnt = [0] * stat_cnt
    for _, group in groupby(data, key=attrgetter('watch_id')):
        points = list(group)
        if len(points) < 2:
            continue

        half = len(points) // 2
        train = points[:half]
        verify = points[half:]

        if remove_bad_wifi_policy > 0:
            bad_wifi = select_bad_wifi(train, bad_wifi_crit)
        else:
            bad_wifi = set()
        if remove_bad_wifi_policy == 1:
            train = remove_bad_wifi(train, bad_wifi)
        elif remove_bad_wifi_policy == 2:
            train = remove_bad_wifi2(train, bad_wifi)

        tot += len(verify)
        for pt in verify:
            if not pt.wifi_snr:
                continue
            if skip_tests_with_bad_wifi and any(
                    mac in bad_wifi for mac in pt.wifi_snr):
                continue
            pt1, sim1 = best_match(train, pt, sim)
            if sim1 is None or sim1 < sim_crit:
                continue
            matched += 1
            bucket = int(dist(pt1, pt) / stat_delta)
            if bucket < stat_cnt:
                cnt[bucket] += 1

    print('测试点总数 =', tot)
    print('匹配数 =', matched)
    acc = 0
    for i, bucket_count in enumerate(cnt):
        acc += bucket_count
        print((i+1) * stat_delta, '米内匹配数 =', acc,
              ', 占测试点总数比例 =', _ratio(acc, tot),
              ', 占所有匹配结果比例 =', _ratio(acc, matched))


def simple_sim(a, b):
    """Cosine similarity between the ``wifi_snr`` vectors of ``a`` and ``b``.

    The dot product runs over the MACs present in both points; the norms
    run over all MACs of each point.

    Returns:
        ``-1`` when the dot product is below 1.9 (which includes the case of
        no shared MAC), otherwise ``dot / (|a| * |b|)``.
    """
    from math import sqrt

    up, d1, d2 = 0.0, 0.0, 0.0
    for key, snr in a.wifi_snr.items():
        d1 += snr * snr
        if key in b.wifi_snr:
            up += snr * b.wifi_snr[key]
    for snr in b.wifi_snr.values():
        d2 += snr * snr

    # NOTE(review): 1.9 is the original cut-off; its rationale is not
    # documented in this file.
    if up < 1.9:
        return -1
    return up / sqrt(d1) / sqrt(d2)


if __name__ == "__main__":
    toplev(infile="basicdata.csv", sim=simple_sim, bad_wifi_crit=5000,
           sim_crit=0.0)