Apache_Solr_Document.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. <?php
  2. /**
  3. * Copyright (c) 2007-2009, Conduit Internet Technologies, Inc.
  4. * All rights reserved.
  5. *
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions are met:
  8. *
  9. * - Redistributions of source code must retain the above copyright notice,
  10. * this list of conditions and the following disclaimer.
  11. * - Redistributions in binary form must reproduce the above copyright
  12. * notice, this list of conditions and the following disclaimer in the
  13. * documentation and/or other materials provided with the distribution.
  14. * - Neither the name of Conduit Internet Technologies, Inc. nor the names of
  15. * its contributors may be used to endorse or promote products derived from
  16. * this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  22. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28. * POSSIBILITY OF SUCH DAMAGE.
  29. *
  30. * @copyright Copyright 2007-2009 Conduit Internet Technologies, Inc. (http://conduit-it.com)
  31. * @license New BSD (http://solr-php-client.googlecode.com/svn/trunk/COPYING)
  32. * @version $Id: Document.php 15 2009-08-04 17:53:08Z donovan.jimenez $
  33. *
  34. * @package Apache
  35. * @subpackage Solr
  36. * @author Donovan Jimenez <djimenez@conduit-it.com>
  37. */
  38. /**
  39. * Additional code Copyright (c) 2011 by Peter Wolanin, and
  40. * additional contributors.
  41. *
  42. * This program is free software; you can redistribute it and/or modify
  43. * it under the terms of the GNU General Public License as published by
  44. * the Free Software Foundation; either version 2 of the License, or (at
  45. * your option) any later version.
  46. *
  47. * This program is distributed in the hope that it will be useful, but
  48. * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  49. * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  50. * for more details.
  51. *
  52. * You should have received a copy of the GNU General Public License
  53. * along with this program as the file LICENSE.txt; if not, please see
  54. * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
  55. */
  /**
   * Holds key / value pairs that represent a Solr document, along with any
   * associated document-level and field-level boost values.
   *
   * Field values can be accessed by direct dereferencing:
   *
   * @code
   * $document->title = 'Something';
   * echo $document->title;
   * @endcode
   *
   * Additionally, the field values can be iterated with foreach:
   *
   * @code
   * foreach ($document as $fieldName => $fieldValue) {
   *   ...
   * }
   * @endcode
   */
  class ApacheSolrDocument implements IteratorAggregate {

    /**
     * Document boost value; FALSE means "no explicit boost".
     *
     * @var float|false
     */
    protected $_documentBoost = FALSE;

    /**
     * Document field values, indexed by field name.
     *
     * A value is either a single value or an array of values (multi-valued
     * field).
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Document field boost values, indexed by field name.
     *
     * Each entry is a float > 0, or FALSE when no boost is set for the field.
     *
     * @var array
     */
    protected $_fieldBoosts = array();

    /**
     * Clear all boosts and fields from this document.
     */
    public function clear() {
      $this->_documentBoost = FALSE;
      $this->_fields = array();
      $this->_fieldBoosts = array();
    }

    /**
     * Get the current document boost.
     *
     * @return float|false
     *   FALSE for the default (no boost), or else a float.
     */
    public function getBoost() {
      return $this->_documentBoost;
    }

    /**
     * Set the document boost factor.
     *
     * @param mixed $boost
     *   Use FALSE for the default boost. Any other value is cast to float; a
     *   result that is not > 0 is treated as FALSE.
     */
    public function setBoost($boost) {
      $boost = (float) $boost;
      $this->_documentBoost = ($boost > 0.0) ? $boost : FALSE;
    }

    /**
     * Add a value to a multi-valued field.
     *
     * NOTE: the Solr XML format allows you to specify boosts PER value even
     * though the underlying Lucene implementation only allows a boost per
     * field. To remedy this, the final field boost value will be the product
     * of all specified boosts on field values - this is similar to SolrJ's
     * functionality.
     *
     * @code
     * $doc = new ApacheSolrDocument();
     * $doc->addField('foo', 'bar', 2.0);
     * $doc->addField('foo', 'baz', 3.0);
     * // Resultant field boost will be 6!
     * echo $doc->getFieldBoost('foo');
     * @endcode
     *
     * @param string $key
     *   Field name.
     * @param mixed $value
     *   Value to append to the field.
     * @param mixed $boost
     *   Use FALSE for the default boost. Any other value is cast to float; a
     *   result that is not > 0 is treated as FALSE.
     */
    public function addField($key, $value, $boost = FALSE) {
      if (!isset($this->_fields[$key])) {
        // Create the holding array if this is the first value.
        $this->_fields[$key] = array();
      }
      elseif (!is_array($this->_fields[$key])) {
        // Move the existing single value into an array.
        $this->_fields[$key] = array($this->_fields[$key]);
      }

      if ($this->getFieldBoost($key) === FALSE) {
        // Boost not already set, set it now.
        $this->setFieldBoost($key, $boost);
      }
      elseif ((float) $boost > 0.0) {
        // Multiply the passed boost with the current field boost - similar to
        // the SolrJ implementation.
        $this->_fieldBoosts[$key] *= (float) $boost;
      }

      // Add the value to the array.
      $this->_fields[$key][] = $value;
    }

    /**
     * Handle the array manipulation for a multi-valued field.
     *
     * @param string $key
     *   Field name.
     * @param string $value
     *   Value to append to the field.
     * @param mixed $boost
     *   Use FALSE for the default boost. Any other value is cast to float; a
     *   result that is not > 0 is treated as FALSE.
     *
     * @deprecated Use addField(...) instead.
     */
    public function setMultiValue($key, $value, $boost = FALSE) {
      $this->addField($key, $value, $boost);
    }

    /**
     * Get field information.
     *
     * @param string $key
     *   Field name.
     *
     * @return array|false
     *   Associative array with 'name', 'value' and 'boost' keys if the field
     *   is set, FALSE otherwise.
     */
    public function getField($key) {
      if (!isset($this->_fields[$key])) {
        return FALSE;
      }
      return array(
        'name' => $key,
        'value' => $this->_fields[$key],
        'boost' => $this->getFieldBoost($key),
      );
    }

    /**
     * Set a field value, replacing any existing value and boost.
     *
     * Multi-valued fields should be set as arrays, or instead use the
     * addField(...) function which will automatically make sure the field is
     * an array.
     *
     * @param string $key
     *   Field name.
     * @param mixed $value
     *   Single value, or array of values for a multi-valued field.
     * @param mixed $boost
     *   Use FALSE for the default boost. Any other value is cast to float; a
     *   result that is not > 0 is treated as FALSE.
     */
    public function setField($key, $value, $boost = FALSE) {
      $this->_fields[$key] = $value;
      $this->setFieldBoost($key, $boost);
    }

    /**
     * Get the currently set field boost for a document field.
     *
     * @param string $key
     *   Field name.
     *
     * @return float|false
     *   Currently set field boost, FALSE if one is not set.
     */
    public function getFieldBoost($key) {
      return isset($this->_fieldBoosts[$key]) ? $this->_fieldBoosts[$key] : FALSE;
    }

    /**
     * Set the field boost for a document field.
     *
     * @param string $key
     *   Field name for the boost.
     * @param mixed $boost
     *   Use FALSE for the default boost. Any other value is cast to float; a
     *   result that is not > 0 is treated as FALSE.
     */
    public function setFieldBoost($key, $boost) {
      $boost = (float) $boost;
      $this->_fieldBoosts[$key] = ($boost > 0.0) ? $boost : FALSE;
    }

    /**
     * Return current field boosts, indexed by field name.
     *
     * @return array
     */
    public function getFieldBoosts() {
      return $this->_fieldBoosts;
    }

    /**
     * Get the names of all fields in this document.
     *
     * @return array
     */
    public function getFieldNames() {
      return array_keys($this->_fields);
    }

    /**
     * Get the values of all fields in this document.
     *
     * @return array
     */
    public function getFieldValues() {
      return array_values($this->_fields);
    }

    /**
     * IteratorAggregate implementation function. Allows usage:
     *
     * @code
     * foreach ($document as $key => $value) {
     *   ...
     * }
     * @endcode
     *
     * The attribute below silences the PHP 8.1+ tentative-return-type
     * deprecation without adding a return type declaration (which would break
     * subclasses that override this method); on older PHP versions the line
     * is parsed as an ordinary comment.
     *
     * @return ArrayIterator
     *   Iterator over a copy of the field values, keyed by field name.
     */
    #[\ReturnTypeWillChange]
    public function getIterator() {
      $arrayObject = new ArrayObject($this->_fields);
      return $arrayObject->getIterator();
    }

    /**
     * Magic get for field values.
     *
     * @param string $key
     *   Field name.
     *
     * @return mixed
     *   The field value, or NULL if the field is not set.
     */
    public function __get($key) {
      // Guard the lookup so reading an unknown field does not raise an
      // "undefined index" notice/warning.
      return isset($this->_fields[$key]) ? $this->_fields[$key] : NULL;
    }

    /**
     * Magic set for field values.
     *
     * Multi-valued fields should be set as arrays, or instead use the
     * addField(...) function which will automatically make sure the field is
     * an array. Setting a field this way resets its boost to the default.
     *
     * @param string $key
     *   Field name.
     * @param mixed $value
     *   Single value, or array of values for a multi-valued field.
     */
    public function __set($key, $value) {
      $this->setField($key, $value);
    }

    /**
     * Magic isset for fields values. Do not call directly. Allows usage:
     *
     * @code
     * isset($document->some_field);
     * @endcode
     *
     * @param string $key
     *   Field name.
     *
     * @return boolean
     *   Whether the given key is set in the document.
     */
    public function __isset($key) {
      return isset($this->_fields[$key]);
    }

    /**
     * Magic unset for field values. Do not call directly. Allows usage:
     *
     * @code
     * unset($document->some_field);
     * @endcode
     *
     * Removes both the field value and its boost.
     *
     * @param string $key
     *   Field name.
     */
    public function __unset($key) {
      unset($this->_fields[$key]);
      unset($this->_fieldBoosts[$key]);
    }

    /**
     * Create an XML fragment from a ApacheSolrDocument instance appropriate
     * for use inside a Solr add call.
     *
     * @param ApacheSolrDocument $document
     *   The document to serialize.
     *
     * @return string
     *   An XML formatted string from the given document, with control
     *   characters replaced by spaces.
     */
    public static function documentToXml(ApacheSolrDocument $document) {
      $xml = '<doc';
      if ($document->getBoost() !== FALSE) {
        $xml .= ' boost="' . $document->getBoost() . '"';
      }
      $xml .= '>';

      foreach ($document as $key => $value) {
        // Look the boost up with the raw field name: boosts are stored under
        // the unescaped key, so escaping first would miss the boost for any
        // name containing XML special characters.
        $fieldBoost = $document->getFieldBoost($key);
        $name = htmlspecialchars($key, ENT_QUOTES, 'UTF-8');

        // Treat a single value as a one-element set so both cases share the
        // same output code.
        $values = is_array($value) ? $value : array($value);
        foreach ($values as $singleValue) {
          $xml .= '<field name="' . $name . '"';
          if ($fieldBoost !== FALSE) {
            $xml .= ' boost="' . $fieldBoost . '"';
            // Only set the boost for the first field in the set.
            $fieldBoost = FALSE;
          }
          $xml .= '>' . htmlspecialchars($singleValue, ENT_NOQUOTES, 'UTF-8') . '</field>';
        }
      }

      $xml .= '</doc>';

      // Remove any control characters to avoid Solr XML parser exception.
      return self::stripCtrlChars($xml);
    }

    /**
     * Replace control (non-printable) characters that are invalid to Solr's
     * XML parser with a space.
     *
     * Tab (x09), line feed (x0A) and carriage return (x0D) are left intact.
     *
     * @param string $string
     *   The string to clean.
     *
     * @return string
     *   The string with each matched control character replaced by a space.
     */
    public static function stripCtrlChars($string) {
      // See: http://w3.org/International/questions/qa-forms-utf-8.html
      // Printable utf-8 does not include any of these chars below x7F.
      return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $string);
    }

  }