| Home | Trees | Indices | Help |
|
|---|
|
|
1
2 __doc__ = """Base classes for match providers.
3
4 They are used by business objects to give
5 phrasewheels the ability to guess phrases.
6
7 Copyright (C) GNUMed developers
8 license: GPL v2 or later
9 """
10 __author__ = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>"
11
12 # std lib
13 import sys
14 import logging
15 import re as regex
16 import datetime as pydt
17
18
19 # GNUmed
20 if __name__ == "__main__":
21 sys.path.insert(0, '../../')
22 from Gnumed.pycommon import gmPG2
23
24
25 _log = logging.getLogger('gm.ui')
26
27
# Characters stripped from the fragment passed to the match
# provider before looking for matches.
# The double quote must live INSIDE the character class: appended
# after the closing "+" it becomes a mandatory literal suffix, so a
# run of ignored characters would only match when directly followed
# by '"'. A raw string avoids invalid escape sequences such as "\[".
default_ignored_chars = r"""[?!.'"(){}\[\]<>~#*$%^_]+"""

# Characters used to detect word boundaries which is, in turn,
# used to normalize word boundaries in the input fragment.
default_word_separators = '[- \t=+&:@]+'
36 #============================================================
38 """Base class for match providing objects.
39
40 Match sources might be:
41 - database tables
42 - flat files
43 - previous input
44 - config files
45 - in-memory list created on the fly
46 """
47 print_queries = False
48 #--------------------------------------------------------
def __init__(self):
    """Set up thresholds, context storage and the fragment normalizers."""
    self.setThresholds()

    # pattern used to normalize word boundaries in fragments
    self.__word_separators = regex.compile(default_word_separators)
    # pattern of characters removed from fragments before matching
    self.__ignored_chars = regex.compile(default_ignored_chars)
    # context values, filled via set_context()
    self._context_vals = {}
56 #--------------------------------------------------------
57 # actions
58 #--------------------------------------------------------
def getMatches(self, aFragment=None):
    """Return matches according to aFragment and matching thresholds.

    The fragment is lower-cased, stripped of ignored characters and
    has runs of word separators collapsed into single blanks. The
    length of the normalized fragment then selects the search: the
    substring threshold is checked first, then the word-start
    threshold, then the phrase-start threshold.

    A fragment of '*' returns all matches. (False, []) is returned
    when the fragment is shorter than every threshold. ValueError is
    raised when aFragment is None.

    FIXME: design decision: we don't worry about data source changes
           during the lifetime of a MatchProvider
    FIXME: append _("*get all items*") on truncation
    """
    # NOTE(review): header reconstructed, the "def" line is absent from
    # this listing; the None default is an assumption - confirm.
    if aFragment is None:
        raise ValueError('Cannot find matches without a fragment.')
    if aFragment == '*':
        # user explicitly wants all matches
        return self.getAllMatches()

    # case insensitivity
    normalized = aFragment.lower()
    strip_pattern = self.__ignored_chars
    if strip_pattern is not None:
        normalized = strip_pattern.sub('', normalized)
    separator_pattern = self.__word_separators
    if separator_pattern is not None:
        normalized = ' '.join(separator_pattern.split(normalized))
    # only significant characters count
    significant_length = len(normalized)

    # order is important !
    if significant_length >= self.__threshold_substring:
        return self.getMatchesBySubstr(normalized)
    if significant_length >= self.__threshold_word:
        return self.getMatchesByWord(normalized)
    if significant_length >= self.__threshold_phrase:
        return self.getMatchesByPhrase(normalized)
    return (False, [])
94 #--------------------------------------------------------
97 #--------------------------------------------------------
100 #--------------------------------------------------------
103 #--------------------------------------------------------
106 #--------------------------------------------------------
109 #--------------------------------------------------------
110 # configuration
111 #--------------------------------------------------------
def setThresholds(self, aPhrase=1, aWord=3, aSubstring=5):
    """Set match location thresholds.

    - the fragment passed to getMatches() must contain at least this many
      characters before it triggers a match search at:
      1) phrase_start - start of phrase (first word)
      2) word_start - start of any word within phrase
      3) in_word - _inside_ any word within phrase

    Returns True when the thresholds were applied. Returns False, and
    leaves the current thresholds untouched, when aSubstring < aWord
    or aWord < aPhrase.
    """
    # NOTE(review): header reconstructed, the "def" line is absent from
    # this listing; the defaults (1, 3, 5) are an assumption - confirm.

    # sanity checks
    if aSubstring < aWord:
        _log.error(
            'Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
            aSubstring, aWord, self.__threshold_substring, self.__threshold_word
        )
        return False
    if aWord < aPhrase:
        # report the pair actually being compared (word vs. phrase),
        # not the substring/word pair of the previous check
        _log.error(
            'Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
            aWord, aPhrase, self.__threshold_word, self.__threshold_phrase
        )
        return False

    # now actually reassign thresholds
    self.__threshold_phrase = aPhrase
    self.__threshold_word = aWord
    self.__threshold_substring = aSubstring

    return True
135 #--------------------------------------------------------
def _set_word_separators(self, word_separators):
    """Store the compiled word separator pattern, or None if None is given."""
    compiled = None if word_separators is None else regex.compile(word_separators)
    self.__word_separators = compiled
141
146
147 word_separators = property(_get_word_separators, _set_word_separators)
148 #--------------------------------------------------------
def _set_ignored_chars(self, ignored_chars):
    """Store the compiled ignored-characters pattern, or None if None is given."""
    compiled = None if ignored_chars is None else regex.compile(ignored_chars)
    self.__ignored_chars = compiled
154
159
160 ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
161 #--------------------------------------------------------
def set_context(self, context=None, val=None):
    """Store a value providing context information for matches.

    Whether the matching code uses the value depends on its exact
    implementation. Names and values of the context depend on what
    is being matched.

    <context> -- the *placeholder* key *inside* the context
                 definition, not the context *definition* key

    Returns False (storing nothing) if context is None, else True.
    """
    # NOTE(review): header reconstructed, the "def" line is absent from
    # this listing; the None defaults are an assumption - confirm.
    if context is not None:
        self._context_vals[context] = val
        return True
    return False
176 #--------------------------------------------------------
182 #------------------------------------------------------------
183 # usable instances
184 #------------------------------------------------------------
186 """Match provider where all possible options can be held
187 in a reasonably sized, pre-allocated list.
188 """
def __init__(self, aSeq=None):
    """Set up the provider with a pre-allocated sequence of items.

    aSeq must be None or a list/tuple of dicts. The matching methods
    read the keys 'list_label' and 'weight' of each dict; the
    documented item layout is (data, list_label, weight).

    Raises TypeError for any other argument type.
    """
    # NOTE(review): header reconstructed, the "def" line is absent from
    # this listing; the None default is an assumption - confirm.
    # isinstance() also accepts list/tuple subclasses
    if aSeq is not None and not isinstance(aSeq, (list, tuple)):
        _log.error('fixed list match provider argument must be a list/tuple of dicts/None')
        raise TypeError('fixed list match provider argument must be a list/tuple of dicts/None')

    self.__items = aSeq
    cMatchProvider.__init__(self)
198
199 #--------------------------------------------------------
200 # internal matching algorithms
201 #
202 # if we end up here:
203 # - aFragment will not be "None"
204 # - aFragment will be lower case
205 # - we _do_ deliver matches (whether we find any is a different story)
206 #--------------------------------------------------------
def getMatchesByPhrase(self, aFragment):
    """Return items whose label starts with aFragment, ignoring case.

    Returns (True, matches) ordered by descending weight, or
    (False, []) when no item matches.
    """
    needle = aFragment.lower()
    hits = [
        entry for entry in self.__items
        if entry['list_label'].lower().startswith(needle)
    ]
    if not hits:
        return (False, [])

    # heaviest items first
    ordered = sorted(hits, key=lambda entry: entry['weight'], reverse=True)
    return (True, ordered)
222
223 #--------------------------------------------------------
def getMatchesByWord(self, aFragment):
    """Return matches for aFragment at start of words inside phrases.

    An item matches when its label (compared case-insensitively)
    starts with aFragment or contains aFragment directly after a
    blank. Returns (True, matches) ordered by descending weight, or
    (False, []) when no item matches.
    """
    needle = aFragment.lower()
    # testing for " <fragment>" covers EVERY occurrence: looking only at
    # the first find() hit misses labels such as "xfoo foo", where the
    # first hit sits inside a word and a later one starts a word
    word_start_needle = ' ' + needle

    matches = []
    for item in self.__items:
        item_label = item['list_label'].lower()
        if item_label.startswith(needle) or word_start_needle in item_label:
            matches.append(item)
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(key = lambda x: x['weight'], reverse = True)
    return (True, matches)
246
247 #--------------------------------------------------------
def getMatchesBySubstr(self, aFragment):
    """Return items whose label contains aFragment anywhere, ignoring case.

    Returns (True, matches) ordered by descending weight, or
    (False, []) when no item matches.
    """
    needle = aFragment.lower()
    hits = [
        entry for entry in self.__items
        if needle in entry['list_label'].lower()
    ]
    if not hits:
        return (False, [])

    # heaviest items first
    hits.sort(key=lambda entry: entry['weight'], reverse=True)
    return (True, hits)
262
263 #--------------------------------------------------------
def getAllMatches(self):
    """Return all items.

    Returns (True, items) ordered by descending weight, or
    (False, []) when there are no items (empty sequence or None).
    """
    items = self.__items
    # no matches found; also covers None, which the constructor accepts
    if not items:
        return (False, [])

    # sorted() returns a new list: the stored sequence is neither
    # reordered nor handed out, and a tuple (which has no .sort()) works
    return (True, sorted(items, key = lambda x: x['weight'], reverse = True))
274
275 #--------------------------------------------------------
277 """items must be a list of dicts. Each dict must have the keys (data, list_label, weight)"""
278 self.__items = items
279
280 # #--------------------------------------------------------
281 # def __cmp_items(self, item1, item2):
282 # """Compare items based on weight."""
283 # if item1['weight'] == item2['weight']:
284 # return 0
285 #
286 # # do it the wrong way round to do sorting/reversing at once
287 # if item1['weight'] < item2['weight']:
288 # return 1
289 # if item1['weight'] > item2['weight']:
290 # return -1
291
292 # ===========================================================
294 """Match provider which searches matches
295 in the results of a function call.
296 """
def __init__(self, get_candidates=None):
    """Set up the provider with a candidate-producing function.

    get_candidates -- callable taking no arguments; the matching
        methods read candidate['list_label'] of each returned item

    Raises ValueError if get_candidates is None.
    """
    # NOTE(review): header reconstructed, the "def" line is absent from
    # this listing; the None default is an assumption - confirm.
    if get_candidates is None:
        _log.error('must define function to retrieve match candidates list')
        raise ValueError('must define function to retrieve match candidates list')

    self._get_candidates = get_candidates
    cMatchProvider.__init__(self)
305 #--------------------------------------------------------
306 # internal matching algorithms
307 #
308 # if we end up here:
309 # - aFragment will not be "None"
310 # - aFragment will be lower case
311 # - we _do_ deliver matches (whether we find any is a different story)
312 #--------------------------------------------------------
def getMatchesByPhrase(self, aFragment):
    """Return matches for aFragment at start of phrases.

    aFragment is expected to be lower case already. Returns
    (True, matches) in the order delivered by the candidate
    function, or (False, []) when no candidate matches.
    """
    # the LABEL must start with the fragment, not the other way round
    matches = [
        candidate for candidate in self._get_candidates()
        if candidate['list_label'].lower().startswith(aFragment)
    ]
    # no matches found
    if not matches:
        return (False, [])

    # No sorting: the only comparator for candidates is commented out,
    # so using it as sort key raised AttributeError on every hit.
    return (True, matches)
328 #--------------------------------------------------------
def getMatchesByWord(self, aFragment):
    """Return matches for aFragment at start of words inside phrases.

    aFragment is expected to be lower case already. A candidate
    matches when its lower-cased label starts with aFragment or
    contains aFragment directly after a blank. Returns
    (True, matches) in the order delivered by the candidate
    function, or (False, []) when no candidate matches.
    """
    # FIXME: use word seps
    word_start_fragment = ' ' + aFragment

    matches = []
    for candidate in self._get_candidates():
        label = candidate['list_label'].lower()
        # Explicit tests instead of indexing label[find() - 1]: a failed
        # find() returns -1, which made the code inspect label[-2] and
        # so report false matches or raise IndexError on short labels.
        if label.startswith(aFragment) or word_start_fragment in label:
            matches.append(candidate)
    # no matches found
    if not matches:
        return (False, [])

    # No sorting: the only comparator for candidates is commented out,
    # so using it as sort key raised AttributeError on every hit.
    return (True, matches)
348 #--------------------------------------------------------
def getMatchesBySubstr(self, aFragment):
    """Return matches for aFragment as a true substring.

    aFragment is expected to be lower case already. Returns
    (True, matches) in the order delivered by the candidate
    function, or (False, []) when no candidate matches.
    """
    matches = [
        candidate for candidate in self._get_candidates()
        if aFragment in candidate['list_label'].lower()
    ]
    # no matches found
    if not matches:
        return (False, [])

    # No sorting: the only comparator for candidates is commented out,
    # so using it as sort key raised AttributeError on every hit.
    return (True, matches)
364 #--------------------------------------------------------
368 #--------------------------------------------------------
369 #def __cmp_candidates(self, candidate1, candidate2):
373 # FIXME: do ordering
374 # if candidate1 < candidate2:
375 # return -1
376 # if candidate1 == candidate2:
377 # return 0
378 # return 1
379
380 # ===========================================================
382 """Match provider which searches matches
383 in possibly several database tables.
384
385 queries:
386 - a list of unicode strings
387 - each string is a query
388 - each string must contain: "... WHERE <column> %(fragment_condition)s ..."
389 - each string can contain in the where clause: "... %(<ctxt_key1>)s ..."
390 - each query must return (data, list_label, field_label)
391
392 context definitions to be used in the queries, example:
393 {'ctxt_key1': {'where_part': 'AND country = %(country)s', 'placeholder': 'country'}}
394
395 client code using .set_context() must use the 'placeholder':
396 <phrasewheel>/<match provider>.set_context('country', 'Germany')
397
398 full example query:
399
400 query = u" " "
401 SELECT DISTINCT ON (list_label)
402 pk_encounter
403 AS data,
404 to_char(started, 'YYYY Mon DD (HH24:MI)') || ': ' || l10n_type || ' [#' || pk_encounter || ']'
405 AS list_label,
406 to_char(started, 'YYYY Mon DD') || ': ' || l10n_type
407 AS field_label
408 FROM
409 clin.v_pat_encounters
410 WHERE
411 (
412 l10n_type %(fragment_condition)s
413 OR
414 type %(fragment_condition)s
415 ) %(ctxt_patient)s
416 ORDER BY
417 list_label
418 LIMIT
419 30
420 " " "
421 context = {'ctxt_patient': {
422 'where_part': u'AND pk_patient = %(PLACEHOLDER)s',
423 'placeholder': u'PLACEHOLDER'
424 }}
425 self.mp = gmMatchProvider.cMatchProvider_SQL2(queries = query, context = context)
426 self.set_context(context = 'PLACEHOLDER', val = '<THE VALUE>')
427
428 _SQL_data2match:
429 SQL to retrieve a match by, say, primary key
430 wherein the only keyword argument is 'pk'
431 """
433
def __init__(self, queries=None, context=None):
    """Set up the SQL based match provider.

    queries -- a single query string or a list/tuple of query strings
    context -- dict of context definitions; None means no context
    """
    # NOTE(review): header reconstructed, the "def" line is absent from
    # this listing; the None defaults are an assumption - confirm.
    cMatchProvider.__init__(self)

    if isinstance(queries, list):
        self._queries = queries
    elif isinstance(queries, tuple):
        # a tuple of queries used to be wrapped whole into a one-element
        # list; it must become a real list because _find_matches()
        # deletes failing queries from it
        self._queries = list(queries)
    else:
        self._queries = [queries]

    self._context = {} if context is None else context

    # query arguments, filled in by the matching methods
    self._args = {}

    self._SQL_data2match = None
449
450 #--------------------------------------------------------
451 # internal matching algorithms
452 #
453 # if we end up here:
454 # - aFragment will not be "None"
455 # - aFragment will be lower case
456 # - we _do_ deliver matches (whether we find any is a different story)
457 #--------------------------------------------------------
def getMatchesByPhrase(self, aFragment):
    """Return matches for aFragment at start of phrases.

    Stores "<aFragment>%" as the 'fragment' query argument and runs
    the queries with an ILIKE condition.
    """
    self._args['fragment'] = '{}%'.format(aFragment)
    return self._find_matches("ILIKE %(fragment)s")
465
466 #--------------------------------------------------------
def getMatchesByWord(self, aFragment):
    """Return matches for aFragment at start of words inside phrases.

    The fragment is passed through gmPG2.sanitize_pg_regex() and
    stored as the 'fragment' query argument in the form
    "( <fragment>)|(^<fragment>)"; the queries are then run with a
    "~*" condition.
    """
    sanitized = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
    self._args['fragment'] = "( %s)|(^%s)" % (sanitized, sanitized)
    return self._find_matches("~* %(fragment)s")
475
476 #--------------------------------------------------------
def getMatchesBySubstr(self, aFragment):
    """Return matches for aFragment as a true substring.

    Stores "%<aFragment>%" as the 'fragment' query argument and runs
    the queries with an ILIKE condition.
    """
    self._args['fragment'] = '%{}%'.format(aFragment)
    return self._find_matches("ILIKE %(fragment)s")
484
485 #--------------------------------------------------------
489
490 #--------------------------------------------------------
492 if self._SQL_data2match is None:
493 return None
494
495 query = {'cmd': self._SQL_data2match, 'args': {'pk': data}}
496 try:
497 rows, idx = gmPG2.run_ro_queries(queries = [query], get_col_idx = False)
498 except Exception:
499 _log.exception('[%s]: error running _SQL_data2match, dropping query', self.__class__.__name__)
500 self._SQL_data2match = None
501 return None
502
503 # hopefully the most frequent case:
504 if len(rows) == 1:
505 return rows[0]
506
507 _log.error('[%s]: 0 or >1 rows found by running _SQL_data2match, ambiguous, ignoring', self.__class__.__name__)
508 return None
509
510 #--------------------------------------------------------
def _rows2matches(self, rows):
    """Turn retrieved database rows into a list of dicts fit for
    phrasewheel use.

    Each row yields a dict with the keys 'weight' (always 0), 'data',
    'list_label' and 'field_label'. Values are looked up by column
    name first and by position (0, 1, 2) when the name raises
    KeyError. If there is no third column either, 'field_label'
    reuses 'list_label'.

    This method can be overridden to massage arbitrary data into the
    proper list of dicts.
    """
    def named_or_positional(row, name, position):
        # prefer the named column, fall back to its position
        try:
            return row[name]
        except KeyError:
            return row[position]

    matches = []
    for row in rows:
        data = named_or_positional(row, 'data', 0)
        list_label = named_or_positional(row, 'list_label', 1)
        # explicit "field_label" in result ?
        try:
            field_label = row['field_label']
        except KeyError:
            # no, but does row[2] exist ?
            try:
                field_label = row[2]
            except IndexError:
                # no: reuse "list_label"
                field_label = list_label
        # PRW wants a weight
        matches.append({
            'weight': 0,
            'data': data,
            'list_label': list_label,
            'field_label': field_label,
        })

    return matches
544
545 #--------------------------------------------------------
def _find_matches(self, fragment_condition):
    """Loads matching data from PostgreSQL and turns them into
    matches fit for consumption by a phrasewheel.

    fragment_condition -- SQL snippet substituted for the
        %(fragment_condition)s placeholder of each query

    The queries are tried in order. The rows of the first query
    returning any rows are converted by _rows2matches() and returned
    as (True, matches). (False, []) is returned when no query yields
    rows. A query raising gmPG2.PG_ERROR_EXCEPTION is removed from
    self._queries and the search stops without trying the rest.
    """
    # NOTE(review): header reconstructed from the call sites, the
    # "def" line is absent from this listing.
    if self.print_queries:
        print("----------------------")
        print(pydt.datetime.now())

    matches = []
    for query in self._queries:
        # replacements for the %(...)s placeholders of the query text
        where_fragments = {'fragment_condition': fragment_condition}

        for context_key, context_def in self._context.items():
            try:
                placeholder = context_def['placeholder']
                where_part = context_def['where_part']
                # raises KeyError if no value was stored for this placeholder
                self._args[placeholder] = self._context_vals[placeholder]
                # we do have a context value for this key, so add the where condition
                where_fragments[context_key] = where_part
                if self.print_queries:
                    print("ctxt ph:", placeholder)
                    print("ctxt where:", where_part)
                    print("ctxt val:", self._context_vals[placeholder])
            except KeyError:
                # we don't have a context value for this key, so skip the where condition
                where_fragments[context_key] = ''
                if self.print_queries:
                    print("invalid ctxt key:", context_key)

        # first substitution stage: fill in the SQL snippets; the query
        # arguments in self._args are passed separately to run_ro_queries()
        cmd = query % where_fragments

        if self.print_queries:
            print("class:", self.__class__.__name__)
            print("ctxt:", self._context_vals)
            print("args:", self._args)
            print("query:", cmd)

        try:
            rows, idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}], get_col_idx = False)
        except gmPG2.PG_ERROR_EXCEPTION:
            _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__)
            idx = self._queries.index(query)
            del self._queries[idx]
            # the list being iterated was just modified, so leave the loop
            break
        # no matches found: try next query
        if len(rows) == 0:
            continue
        matches = self._rows2matches(rows)
        return (True, matches)
    # none found whatsoever
    return (False, [])
597
598 #================================================================
599 if __name__ == '__main__':
600 pass
601
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Sat Feb 29 02:55:27 2020 | http://epydoc.sourceforge.net |