# Copyright (C) 2019 Xiaotiancai Science & Technology Co. Ltd.
# All rights reserved.
def parse_macaddr(s):
    """Convert a colon-separated hexadecimal MAC address string to an int.

    Every ':' is removed and the remaining characters are parsed as a single
    base-16 number, e.g. ``"00:00:00:00:01:0a"`` becomes ``266``.

    Raises:
        ValueError: If what remains after dropping the colons is not a
            valid hexadecimal literal.
    """
    hex_digits = s.replace(':', '')
    return int(hex_digits, 16)
def timestamp(s):
    """Convert a ``'YYYY-MM-DD HH:MM:SS'`` string to seconds since the epoch.

    The text is parsed with ``time.strptime`` and converted with
    ``time.mktime``, so it is interpreted as local time.

    Returns:
        The epoch time as a float.

    Raises:
        ValueError: If ``s`` does not match the expected format.
    """
    import time

    parsed = time.strptime(s, '%Y-%m-%d %H:%M:%S')
    return time.mktime(parsed)
def read_data(fd):
    """Read positioning samples from a CSV source and parse them.

    Args:
        fd: Either a path (``str``) to a CSV file, or an already opened text
            file object (any iterable of CSV lines accepted by
            ``csv.DictReader``). The CSV must provide the columns
            ``watch_id``, ``latitude``, ``longitude``, ``create_time`` and
            ``params``.

    Returns:
        A list of ``Data`` named tuples with the fields:

        * ``watch_id``   - ``int`` parsed from the hexadecimal column text;
        * ``latitude``   - ``float``;
        * ``longitude``  - ``float``;
        * ``wifi_snr``   - ``dict`` mapping an integer MAC address to the
          float found after it in the ``macs`` entry;
        * ``timestamp``  - result of ``timestamp()`` on ``create_time``.

        Rows whose ``params`` JSON has no ``macs`` entry, or an empty one,
        are skipped.
    """
    import csv
    import json
    from collections import namedtuple

    if isinstance(fd, str):
        # The csv module requires files to be opened with newline='' so that
        # line breaks embedded in quoted fields are parsed correctly.
        with open(fd, newline='') as f:
            return read_data(f)

    Data = namedtuple('Data',
                      ['watch_id', 'latitude', 'longitude', 'wifi_snr',
                       'timestamp'])

    data = []
    for row in csv.DictReader(fd):
        params = json.loads(row['params'])
        if 'macs' not in params:
            continue
        macs = params['macs']
        if not macs:
            continue

        # "macs" looks like "aa:bb:..,<value>|cc:dd:..,<value>|...".
        wifi_snr = {}
        for entry in macs.split('|'):
            if not entry.strip():
                # Tolerate stray separators (e.g. a trailing '|') instead of
                # failing the whole file on an empty segment.
                continue
            fields = entry.split(',')
            wifi_snr[parse_macaddr(fields[0])] = float(fields[1])

        data.append(Data(int(row['watch_id'], 16),
                         float(row['latitude']),
                         float(row['longitude']),
                         wifi_snr,
                         timestamp(row['create_time'])))
    return data
def dist(p, q):
    """Return the great-circle distance in meters between two points.

    The distance is computed with the haversine formula on a sphere whose
    radius is the mean Earth radius.

    Args:
        p: Object exposing ``latitude`` and ``longitude`` in degrees.
        q: Object exposing ``latitude`` and ``longitude`` in degrees.

    Returns:
        The distance in meters as a float.
    """
    from math import sin, cos, sqrt, asin, radians

    # Mean Earth radius in meters. The previous constant (63710000) had one
    # zero too many, which overstated every distance by a factor of ten;
    # thresholds that were tuned against the old values need revisiting.
    earth_radius_m = 6371000.0

    lat0 = radians(p.latitude)
    lat1 = radians(q.latitude)
    dlat = abs(lat0 - lat1)
    dlng = abs(radians(p.longitude) - radians(q.longitude))

    def hav(x):
        half = sin(x / 2)
        return half * half

    h = hav(dlat) + cos(lat0) * cos(lat1) * hav(dlng)
    # Rounding can push h marginally above 1 for (near-)antipodal points,
    # which would make sqrt/asin leave their domain.
    h = min(1.0, h)
    return 2 * earth_radius_m * asin(sqrt(h))
def best_match(known_points, new_point, similarity):
    """Find the known point that is most similar to ``new_point``.

    A plain linear scan, much like naive plagiarism detection in programming
    contests: every candidate is scored and the highest score wins.

    Args:
        known_points: Iterable of points exposing a ``wifi_snr`` container.
            Points whose ``wifi_snr`` is empty are ignored.
        new_point: The point to match; passed as second argument to
            ``similarity``.
        similarity: Callable ``similarity(known, new)`` returning a
            comparable score, or ``None`` when no score is available.

    Returns:
        A ``(point, score)`` tuple for the best candidate. On ties the
        earliest candidate is kept. ``(None, None)`` if nothing was scored.
    """
    winner = None
    top_score = None
    usable = (pt for pt in known_points if len(pt.wifi_snr) != 0)
    for candidate in usable:
        score = similarity(candidate, new_point)
        if score is not None and (top_score is None or score > top_score):
            winner, top_score = candidate, score
    return winner, top_score
def remove_bad_wifi(pts, crit = 500):
    """Drop Wi-Fi entries whose observed locations are too spread out.

    For every MAC address found in the points' ``wifi_snr`` mappings, the
    arithmetic mean of the latitudes/longitudes where it was seen is used as
    a centre. If any sighting lies farther than ``crit`` (as measured by
    ``dist``) from that centre, the MAC is considered unreliable and is
    removed from the ``wifi_snr`` mapping of every point.

    Args:
        pts: Sequence of points exposing ``latitude``, ``longitude`` and a
            mutable ``wifi_snr`` mapping. The mappings are modified in place.
        crit: Maximum allowed ``dist`` between a sighting and the centre.

    Returns:
        None.
    """
    from collections import namedtuple
    Loc = namedtuple('Loc', ['latitude', 'longitude'])

    # MAC address -> every location at which it was observed.
    sightings = {}
    for pt in pts:
        for mac in pt.wifi_snr:
            sightings.setdefault(mac, []).append(
                Loc(pt.latitude, pt.longitude))

    unreliable = set()
    for mac, places in sightings.items():
        lat_total = 0.0
        lng_total = 0.0
        for place in places:
            lat_total += place.latitude
            lng_total += place.longitude
        centre = Loc(lat_total / len(places), lng_total / len(places))
        if any(dist(centre, place) > crit for place in places):
            unreliable.add(mac)

    for pt in pts:
        for mac in unreliable:
            pt.wifi_snr.pop(mac, None)
def toplev(infile, sim, bad_wifi_crit, sim_crit, stat_cnt = 10,
           stat_delta = 50):
    """Evaluate Wi-Fi fingerprint matching on a data file and print statistics.

    The samples are grouped by ``watch_id``. Within each group the points are
    ordered by timestamp; the earlier half is used as the known ("train") set
    and the later half is matched against it. Groups with fewer than two
    points are skipped.

    Args:
        infile: Path or file object accepted by ``read_data``.
        sim: Similarity function passed to ``best_match``.
        bad_wifi_crit: Threshold passed to ``remove_bad_wifi`` for the
            training half of every group.
        sim_crit: Minimum similarity for a match to be counted.
        stat_cnt: Number of distance buckets in the report.
        stat_delta: Width of one distance bucket, in the unit returned by
            ``dist``.

    Returns:
        None. The totals and one cumulative line per bucket are printed.
        A ratio whose denominator is zero is reported as ``0.0``.
    """
    from itertools import groupby

    data = read_data(infile)
    by_watch = lambda x: x.watch_id
    # groupby only merges adjacent items, so sort by the same key first.
    data.sort(key=by_watch)

    tot = 0
    matched = 0
    cnt = [0] * stat_cnt
    for _, group in groupby(data, by_watch):
        points = list(group)
        if len(points) < 2:
            continue
        points.sort(key=lambda x: x.timestamp)

        half = len(points) // 2
        train = points[:half]
        verify = points[half:]
        remove_bad_wifi(train, bad_wifi_crit)

        tot += len(verify)
        for pt in verify:
            if not pt.wifi_snr:
                continue
            pt1, sim1 = best_match(train, pt, sim)
            if sim1 is None or sim1 < sim_crit:
                continue
            matched += 1
            bucket = int(dist(pt1, pt) / stat_delta)
            if bucket < stat_cnt:
                cnt[bucket] += 1

    print('测试点总数 =', tot)
    print('匹配数 =', matched)
    acc = 0
    # Iterate over the configured number of buckets (not a fixed 10) so the
    # report stays in sync with the size of cnt.
    for i in range(stat_cnt):
        acc += cnt[i]
        # Guard against empty input / no matches instead of dividing by zero.
        of_total = acc / tot if tot else 0.0
        of_matched = acc / matched if matched else 0.0
        print((i+1) * stat_delta, '米内匹配数 =', acc,
              ', 占测试点总数比例 =', of_total, ', 占所有匹配结果比例 =',
              of_matched)
def simple_sim(a, b):
    """Cosine-style similarity between the ``wifi_snr`` mappings of two points.

    The dot product is taken over the keys present in both mappings and is
    divided by the Euclidean norms of the two full mappings.

    Args:
        a: Object exposing a ``wifi_snr`` mapping of key -> number.
        b: Object exposing a ``wifi_snr`` mapping of key -> number.

    Returns:
        ``-1`` when the shared dot product is zero or below 1.9; otherwise
        the dot product divided by both norms.
    """
    from math import sqrt

    snr_a = a.wifi_snr
    snr_b = b.wifi_snr

    dot = 0.0
    norm_a_sq = 0.0
    for mac, value in snr_a.items():
        norm_a_sq += value * value
        if mac in snr_b:
            dot += value * snr_b[mac]

    norm_b_sq = 0.0
    for value in snr_b.values():
        norm_b_sq += value * value

    if dot == 0 or dot < 1.9:
        return -1
    return dot / sqrt(norm_a_sq) / sqrt(norm_b_sq)
if __name__ == "__main__":
    # Script entry point: evaluate simple_sim on basicdata.csv. Note that
    # bad_wifi_crit is compared against values returned by dist().
    toplev(
        infile="basicdata.csv",
        sim=simple_sim,
        bad_wifi_crit=5000,
        sim_crit=0.0,
    )
|