Login

Full-Text Searchable Models

Author:
dcwatson
Posted:
February 18, 2009
Language:
Python
Version:
1.0
Score:
8 (after 8 ratings)

A drop-in module that adds full-text search to Django models with very little effort. Tested with PostgreSQL 8.3; it should also work on earlier versions that have the tsearch2 module installed.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
"""
Support for full-text searchable Django models using tsearch2 in PostgreSQL.

An example:

from search import SearchableModel, SearchManager
from django.db import models

class TestModel (SearchableModel):
	name = models.CharField( max_length=100 )
	description = models.TextField()
	
	# Defining a SearchManager without fields will use all CharFields and TextFields
	# objects = SearchManager()
	
	# You can pass a list of fields that should be indexed
	# objects = SearchManager( fields=('name','description') )
	
	# You may also specify fields as a dictionary, mapping each field to a weight for ranking purposes
	# see http://www.postgresql.org/docs/8.3/static/textsearch-features.html#TEXTSEARCH-MANIPULATE-TSVECTOR
	objects = SearchManager( fields={
		'name': 'A',
		'description': 'B',
	} )

# Create some test data. By default, the index field is automatically updated when save() is called.
TestModel.objects.create( name='Model One', description='Hello world, this is a test.' )
TestModel.objects.create( name='Model Two', description='Testing, testing, one two three.' )

# You can force an index update to all or some instances:
TestModel.objects.update_index()
TestModel.objects.update_index( pk=1 )
TestModel.objects.update_index( pk=[1,2] )

# Perform a search with no ranking
TestModel.objects.search( 'hello' )

# Perform a search that ranks the results, orders by the rank, and assigns the ranking
# value to the field specified by rank_field
TestModel.objects.search( 'test', rank_field='rank' )
"""

from django.db import models

class VectorField (models.Field):
	"""
	A model field whose database column type is PostgreSQL's tsvector.
	
	The constructor always forces null=True, editable=False and serialize=False,
	overriding any values the caller passes for those three options.
	"""
	
	def __init__( self, *args, **kwargs ):
		# These options are forced rather than defaulted; caller-supplied values are replaced.
		kwargs.update( null=True, editable=False, serialize=False )
		super( VectorField, self ).__init__( *args, **kwargs )
	
	def db_type( self, connection=None ):
		"""
		Returns the column type used when creating the table.
		
		connection is accepted (and ignored) because newer Django versions call
		db_type(connection=...); it stays optional so older zero-argument calls still work.
		"""
		return 'tsvector'

class SearchableModel (models.Model):
	"""
	A convenience abstract base model that carries a tsvector column (search_index)
	and keeps it up to date.
	
	Instances gain an update_index method, and save re-indexes the instance after
	writing it. A model may define a boolean class variable, _auto_reindex, to control
	whether save triggers that re-index; when the variable is absent, re-indexing
	is on.
	"""
	
	search_index = VectorField()
	
	class Meta:
		abstract = True
	
	def update_index( self ):
		"""
		Refreshes this instance's index through the manager stored on the class as
		_search_manager. Does nothing when no such attribute is present.
		"""
		if not hasattr( self, '_search_manager' ):
			return
		self._search_manager.update_index( pk=self.pk )
	
	def save( self, *args, **kwargs ):
		"""
		Saves the instance, then re-indexes it unless _auto_reindex is defined and falsy.
		"""
		super( SearchableModel, self ).save( *args, **kwargs )
		# A missing _auto_reindex counts as enabled.
		if getattr( self, '_auto_reindex', True ):
			self.update_index()

class SearchManager (models.Manager):
	"""
	A manager that builds the tsvector index for its model and runs full-text
	searches against it.
	"""
	
	def __init__( self, fields=None, config=None ):
		"""
		fields may be None (use every CharField and TextField of the model), a list or
		tuple of field names, or a dictionary mapping field names to ranking weights.
		config is the text search configuration name; it defaults to 'pg_catalog.english'.
		"""
		self.fields = fields
		self.default_weight = 'A'
		self.config = config or 'pg_catalog.english'
		self._vector_field_cache = None
		super( SearchManager, self ).__init__()
	
	def contribute_to_class( self, cls, name ):
		# Instances need to get to us to update their indexes.
		setattr( cls, '_search_manager', self )
		super( SearchManager, self ).contribute_to_class( cls, name )
	
	def _find_text_fields( self ):
		"""
		Return the names of all CharField and TextField fields defined for this manager's model.
		"""
		text_types = (models.CharField, models.TextField)
		return [f.name for f in self.model._meta.fields if isinstance( f, text_types )]
	
	def _vector_field( self ):
		"""
		Returns the VectorField defined for this manager's model. There must be exactly one VectorField defined.
		Raises ValueError otherwise. The result is cached after the first successful lookup.
		"""
		if self._vector_field_cache is not None:
			return self._vector_field_cache
		vectors = [f for f in self.model._meta.fields if isinstance( f, VectorField )]
		if len( vectors ) != 1:
			raise ValueError( "There must be exactly 1 VectorField defined for the %s model." % self.model._meta.object_name )
		self._vector_field_cache = vectors[0]
		return self._vector_field_cache
	vector_field = property( _vector_field )
	
	def _vector_sql( self, field, weight=None ):
		"""
		Returns the SQL used to build a tsvector from the given (django) field name.
		When weight is None, self.default_weight is used.
		"""
		if weight is None:
			weight = self.default_weight
		f = self.model._meta.get_field( field )
		return "setweight( to_tsvector( '%s', coalesce(\"%s\",'') ), '%s' )" % (self.config, f.column, weight)
	
	def update_index( self, pk=None ):
		"""
		Updates the full-text index for one, many, or all instances of this manager's model.
		
		pk may be None (every row), a single primary key value, or a list/tuple of
		primary key values. An empty list/tuple updates nothing.
		"""
		from django.db import connection
		# Resolve the default field list once and remember it on the manager.
		if self.fields is None:
			self.fields = self._find_text_fields()
		# Build a list of SQL clauses that generate tsvectors for each specified field.
		if isinstance( self.fields, (list,tuple) ):
			clauses = [self._vector_sql( field ) for field in self.fields]
		else:
			clauses = [self._vector_sql( field, weight ) for field, weight in self.fields.items()]
		vector_sql = ' || '.join( clauses )
		where = ''
		params = []
		# If one or more pks are specified, tack a WHERE clause onto the SQL. The pk
		# values are bound as query parameters instead of being spliced into the SQL text.
		if pk is not None:
			pk_column = self.model._meta.pk.column
			if isinstance( pk, (list,tuple) ):
				if not pk:
					# "IN ()" is not valid SQL, and there is nothing to update anyway.
					return
				params = list( pk )
				where = " WHERE \"%s\" IN (%s)" % (pk_column, ','.join( ['%s'] * len( params ) ))
			else:
				params = [pk]
				where = " WHERE \"%s\" = %%s" % pk_column
		sql = "UPDATE \"%s\" SET \"%s\" = %s%s;" % (self.model._meta.db_table, self.vector_field.column, vector_sql, where)
		cursor = connection.cursor()
		try:
			if params:
				cursor.execute( sql, params )
			else:
				cursor.execute( sql )
			# NOTE(review): an explicit COMMIT is issued on the raw cursor, as before; confirm this
			# is still wanted when the caller manages its own transaction.
			cursor.execute( "COMMIT;" )
		finally:
			# Close the cursor even when one of the statements fails.
			cursor.close()
	
	def search( self, query, rank_field=None, rank_normalization=32 ):
		"""
		Returns a queryset after having applied the full-text search query. If rank_field
		is specified, it is the name of the field that will be put on each returned instance.
		When specifying a rank_field, the results will automatically be ordered by -rank_field.
		
		The query is converted to text and handed to to_tsquery as a bound parameter, so it
		must be written in to_tsquery syntax.
		
		For possible rank_normalization values, refer to:
		http://www.postgresql.org/docs/8.3/static/textsearch-controls.html#TEXTSEARCH-RANKING
		"""
		# u'%s' formatting performs the same conversion as unicode() on Python 2 and also runs on Python 3.3+.
		query_text = u'%s' % (query,)
		# Leave a %s placeholder for the query text; the database driver does the quoting.
		ts_query = "to_tsquery('%s',%%s)" % self.config
		column = self.vector_field.column
		where = "\"%s\" @@ %s" % (column, ts_query)
		select = {}
		select_params = []
		order = []
		if rank_field is not None:
			select[rank_field] = 'ts_rank( "%s", %s, %d )' % (column, ts_query, rank_normalization)
			# The rank expression repeats the tsquery, so it needs its own copy of the parameter.
			select_params.append( query_text )
			order = ['-%s' % rank_field]
		return self.all().extra( select=select, select_params=select_params, where=[where], params=[query_text], order_by=order )

More like this

  1. Template tag - list punctuation for a list of items by shapiromatron 2 months, 2 weeks ago
  2. JSONRequestMiddleware adds a .json() method to your HttpRequests by cdcarter 2 months, 3 weeks ago
  3. Serializer factory with Django Rest Framework by julio 9 months, 2 weeks ago
  4. Image compression before saving the new model / work with JPG, PNG by Schleidens 10 months, 1 week ago
  5. Help text hyperlinks by sa2812 11 months ago

Comments

nab (on January 12, 2010):

Shouldn't "to_tsquery" be replaced with "plainto_tsquery" in the search function?

#

scott2b (on March 24, 2016):

I am inclined to agree with @nab on plainto_tsquery. Also, I am thinking the default ordering should specify ts_rank.

#

Please login first before commenting.