don’t U θink?

Summer vacation is over.
I went in all fired up to "build a search engine using Programming Collective Intelligence as a reference!", took my sweet time with it, and this is the result.

>>> s.getcollocations("don't you")
Top 5 results of 9 for collocations of "don't you"
... don't you ? ... : 6
... don't you think ... : 4
... don't you take ... : 2
... don't you call ... : 1
... don't you write ... : 1
>>> s.getcollocations("with you", -2)
Top 5 results of 85 for collocations of "with you"
... i agree with you ... : 11
... to communicate with you ... : 10
... to share with you ... : 7
... rests entirely with you ... : 5
... rights rests with you ... : 4

You can search for the words and phrases that attach to the front or back of a query word or phrase. I haven't tried it, but since the text is split with MeCab it should work for Japanese as well. I did think it might get interesting if I added wildcards and the like, but the Olympics were more interesting, so I didn't. A rough sketch of the underlying idea follows below.
That said, even though there are only about 10,000 pages in the DB (granted, the program isn't very good), search speed is already half dead. If you wanted to do this seriously, distributing the work seems to be a must. I still think this kind of search engine would be really handy when writing English.
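
To make the idea concrete before the full listing: the collocation search boils down to counting which tokens sit next to every occurrence of the query phrase. Here is a minimal dict-based sketch of just that idea (only the "words after" case, with a toy in-memory corpus standing in for the MeCab tokenizer and SQLite tables used in the real code):

# -*- coding:utf-8 -*-
# Sketch only: count the numcols words that immediately follow a query phrase.
# 'docs' is a toy stand-in for the crawled, tokenized pages.
def collocations(docs, phrase, numcols=1):
  counts = {}
  plen = len(phrase)
  for words in docs:
    for i in range(len(words) - plen - numcols + 1):
      if words[i:i+plen] == phrase:
        key = ' '.join(words[i+plen:i+plen+numcols])
        counts[key] = counts.get(key, 0) + 1
  return sorted(counts.items(), key=lambda kv: -kv[1])

docs = ["don't you think so".split(), "don't you think again".split(),
        "why don't you call me".split()]
print collocations(docs, ["don't", "you"], 1)
# -> [('think', 2), ('call', 1)]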

Below is the source code from this round. The base is the sample from the book, but I've changed it quite a bit. Python turned out to be pretty fun!

  • searchengine.py
# -*- coding:utf-8 -*-

from BeautifulSoup import *
from ctypes import *
from pysqlite2 import dbapi2 as sqlite
from urlparse import urljoin
import os
import socket
import urllib2

class crawler:
  
  def __init__(self, dbname, libpath='C:/Program Files/MeCab/bin/libmecab.dll'):
    self.dbname = dbname
    self.con = sqlite.connect(dbname)
    self.splitter = mecab(libpath)
    socket.setdefaulttimeout(30)

  def __del__(self):
    self.con.close()

  def dbcommit(self):
    self.con.commit()

  def getentryid(self, table, field, value, createnew=True):
    cur = self.con.execute("select rowid from %s where %s = '%s'" % (table, field, value))
                  
    res = cur.fetchone()
    if res == None:
      # not yet in the index, so insert it
      cur = self.con.execute("insert into %s (%s) values ('%s')" % (table,field,value))
      return cur.lastrowid
    else:
      return res[0]
  
  def addtoindex(self, url, soup, rawtext):
    
    text = self.gettextonly(soup) # extract the text from the HTML
#    print '%s' % text
    words = self.separatewords(text) # split into words
#    for word in words:
#      print word
      
    urlid = self.getentryid('urllist', 'url', url)
    self.write(urlid, rawtext)
        
    for i in range(len(words)):
      word = words[i]
#      if word in ignorewords: continue
      word = self.escape(word)
            
      wordid = self.getentryid('wordlist', 'word', word)
      self.con.execute("insert into wordlocation(urlid,wordid,location) values(%d,%d,%d)" % (urlid, wordid, i))

    return urlid
  
  def gettextonly(self, soup):
   
    v = soup.string 
    
    if v == None: # tag that has no string of its own
      c = soup.contents
      resulttext = ''
      for t in c: # walk this tag's children
        subtext = self.gettextonly(t) # recurse into each child
        if (not subtext.isspace()) and len(subtext) > 0:
          resulttext += subtext + '\n'
      return resulttext
    
    else: # tag that carries a string
      try:
        s = soup.name # tag name
        if s == 'script' or s == 'style' or s == 'form':
          return '' # skip everything under script/style/form
  #      print '===== %s ===' % soup.name
  #      print v.encode('utf-8').strip()
        return v.encode('utf-8').strip() # strip surrounding whitespace
      except: return ''
   
  def isindexed(self, url):
    # check the urllist table
    u = self.con.execute("select rowid from urllist where url='%s'" % url).fetchone()
    if u != None:
      # check the wordlocation table (were the words actually registered?)
      v = self.con.execute('select * from wordlocation where urlid=%d' % u[0]).fetchone()
      if v != None: return True
    return False
    
  def addlinkref(self, urlFrom, urlTo, linkText):
    words = self.separatewords(linkText)
    fromid = self.getentryid('urllist', 'url', urlFrom)
    toid = self.getentryid('urllist', 'url', urlTo)
    
    if fromid == toid: return
    
    cur = self.con.execute("insert into link(fromid,toid) values (%d,%d)" % (fromid, toid))
    
    linkid = cur.lastrowid
    for word in words:
#      if word in ignorewords: continue
      word = self.escape(word)
      wordid=self.getentryid('wordlist', 'word', word)
      self.con.execute("insert into linkwords(linkid,wordid) values (%d,%d)" % (linkid, wordid))
    
  def crawl(self, pages, depth=2):
    # breadth-first crawl, depth levels deep

    for i in range(depth):
      newpages = set() # pages discovered at this level

      page_idx = 0
      for page in pages:
        
        page_idx += 1
        try:
          c = urllib2.urlopen(page) # open url
          print "(%d:%d/%d) Open %s" % (i, page_idx, len(pages), page)
        except:
          print "Could not open %s" % page
          continue
              
        try:
          rawtext = c.read()
          soup = BeautifulSoup(rawtext) # parse the downloaded page
          c.close()
        except:
          continue
        
        indexed = self.isindexed(page) # remember this; it is checked again when recording links below
        if not indexed:
          print "Indexing..."
          self.addtoindex(page, soup, rawtext) # add the page to the index

        if i + 1 < depth: # only gather links while there is depth left to crawl
          links = soup('a') # collect the <a> tags
          numnewpages = len(newpages)
          for link in links:
            if 'href' in dict(link.attrs): # the link has an href attribute
              url = urljoin(page, link['href'])
              #例: urljoin('http://www.cwi.nl/%7Eguido/Python.html', 'FAQ.html')
              #     -> 'http://www.cwi.nl/%7Eguido/FAQ.html'
  
              url = self.escape(url) # ' -> ''
              url = url.split('#')[0] # drop the fragment/anchor
              if url[0:4] == 'http' and not url in pages and not self.isnotacceptted(url):
                newpages.add(url) # crawl it even if it is already indexed
                print '.',
              
              if not indexed: # only record link text the first time this page gets indexed
                linkText = self.gettextonly(link)
                self.addlinkref(page, url, linkText)
  
          print '\nFound %d new pages to be crawled (%d from this page)' % (len(newpages), len(newpages)-numnewpages)
        
        self.dbcommit()

      pages = newpages 
   
  def createindextables(self):
    self.con.execute('create table urllist(url)')
    self.con.execute('create table wordlist(word)')
    self.con.execute('create table wordlocation(urlid,wordid,location)')
    self.con.execute('create table link(fromid integer,toid integer)')
    self.con.execute('create table linkwords(wordid,linkid)')

    self.con.execute('create index uUrl on urllist(url)')
    self.con.execute('create index wWord on wordlist(word)')
    self.con.execute('create index wlUrlid on wordlocation(urlid)')
    self.con.execute('create index wlWordid on wordlocation(wordid)')
    self.con.execute('create index wlLocation on wordlocation(location)')
    self.con.execute('create index wlUrlLoc on wordlocation(urlid,location)')
    self.con.execute('create index wlAll on wordlocation(urlid,wordid,location)')
    self.con.execute('create index lFromid on link(fromid)')
    self.con.execute('create index lToid on link(toid)')
    self.con.execute('create index lwWordid on linkwords(wordid)')
    self.con.execute('create index lwLinkid on linkwords(linkid)')

  def droptables(self):
    try:
      self.con.execute('drop table urllist')
      self.con.execute('drop table wordlist')
      self.con.execute('drop table wordlocation')
      self.con.execute('drop table link')
      self.con.execute('drop table linkwords')
    except:
      pass
  
  def separatewords(self, text):
    return [s.lower() for s in self.splitter.split(text) if s != '']
  
  def isnotacceptted(self, url):
    pos = url.find('?')
    if pos != -1: u = url[:pos]
    else: u = url
    
    # rough check on the file extension
    if u.endswith('mp3') or u.endswith('wav') or \
       u.endswith('txt') or u.endswith('wmv') or \
       u.endswith('flv') or u.endswith('mpg') or \
       u.endswith('rss') or u.endswith('xml') or \
       u.endswith('rdf') or u.endswith('cgi'):
       return True
    return False
  
  def escape(self,str):
    if str.find("'") != -1:
      return str.replace("'", "''")
    return str
    
  def write(self,urlid,text):
    dir = self.dbname.replace('.', '_')
    if not os.access(dir, os.F_OK):
      os.mkdir(dir)
    f = open(('%s/urlid_%d' % (dir,urlid)), "w")
    f.write(text)
    f.close()


class searcher:
  
  def __init__(self, dbname, libpath='C:/Program Files/MeCab/bin/libmecab.dll'):
    self.con = sqlite.connect(dbname)
    self.splitter = mecab(libpath)
    
  def __del__(self):
    self.con.close()
    
  def getmatchrows(self,q):
    fieldlist = 'w0.urlid'
    tablelist = ''
    clauselist = ''
    wordids =[]
    
    words = q.split(' ')
    tablenumber = 0
    
    for word in words:
      
      wordrow = self.con.execute("select rowid from wordlist where word='%s'" % word).fetchone()
      if wordrow != None:
        wordid = wordrow[0]
        wordids.append(wordid)
        
        if tablenumber > 0: # if the query has two or more words, join the location tables
          tablelist += ', '
          clauselist += ' and '
          clauselist += 'w%d.urlid = w%d.urlid and ' % (tablenumber-1,tablenumber)
          
        fieldlist += ', w%d.location' % tablenumber
        tablelist += 'wordlocation w%d' % tablenumber
        clauselist += 'w%d.wordid = %d' % (tablenumber, wordid)
        tablenumber += 1
        
    fullquery = 'select %s from %s where %s' % (fieldlist,tablelist,clauselist)
    #print fullquery
    
    cur=self.con.execute(fullquery)
    rows = [row for row in cur]
    # row -> (urlid, word0location, word1location, word2location, ...)
    # wordids -> [word0id, word1id, word2id, ...]
    return rows,wordids
          
  def getscoredlist(self, rows, wordids):
    # row -> (urlid, word0location, word1location, word2location, ...)
    
    totalscores = dict([(row[0],0) for row in rows])
    if len(rows) == 0:
      return totalscores
    
#    print 'Number of Rows: %d' % len(rows)
        
    weights = [(1.0, self.frequencyscore(rows)),
               (2.0, self.locationscore(rows)),
#               (1.0, self.distancescore(rows)),
               (2.0, self.inboundlinkscore(rows)),
               ]
    
    
    for (weight, scores) in weights:
      for url in totalscores:
        totalscores[url] += weight * scores[url]
    
    return totalscores
    
  
  def query(self, q, numrets=10):

    rows, wordids = self.getmatchrows(q)
    scores = self.getscoredlist(rows, wordids)
     
    # items() returns (key, value) tuples; build (score, url) pairs and sort by score
    rankedscores = sorted([(score, url) for (url, score) in scores.items()], reverse=1)
    print "Top %d results of %d for '%s'" % (numrets,len(rankedscores),q)
        
    for (score, urlid) in rankedscores[0:numrets]:
      print '%f\t%s' % (score, self.geturlname(urlid))
  
  
  def getcollocations(self, query, numcols=1, numrets=5):
    query = query.lower()
    qwords = self.separatewords(query) # split into words

    if numcols == 0: numcols = 1 # guard against an invalid value
    
    # look up the wordids of the query words
    qwordids = []
    for i in range(len(qwords)):
      w = qwords[i].replace("'", "''")
      r = self.con.execute("select rowid from wordlist where word='%s'" % w).fetchone()
      if r != None: 
        qwordids.append(r[0])
      else: 
        print 'No results for %s' % query
        return
  
  #  print qwordids
  
    # get urlid and locations (but only where the query words appear adjacently)
    field = 'w0.urlid, w0.location'
    table = 'wordlocation w0'
    clause = 'w0.wordid = %s' % qwordids[0]
  
    i = 1
    while i < len(qwords):
      field += ', w%d.location' % i
      table += ', wordlocation w%d' % i
      clause += ' and w%d.wordid = %s and w0.location = w%d.location-%d and w0.urlid = w%d.urlid' % (i,qwordids[i],i,i,i)
      i += 1
  
    rows = self.con.execute("select %s from %s where %s" % (field,table,clause)).fetchall()
  
    step = 1 # direction to step through neighbouring words
    if (numcols < 0): step = -1
    
    # walk through the words that follow (or precede) the query
    counts = {}
    for row in rows:
      # row: (urlid, word0location, word1location, ...)
      url = row[0]
      location = row[len(row)-1] # location of the last query word
      if (numcols < 0): location = row[1] # location of the first query word
  
      # collect the ids of the numcols words after (before) the query
      wordids = []
      k = 1
      while k <= (numcols*step):
        id = self.getwordid(url, location + (k*step))
        if id != -1:
          wordids.append(id)
        k += 1
  
      # only count it if we actually got numcols words
      if len(wordids) == (numcols*step):
        if numcols > 0: key = self.listtostr(wordids) # build the key as "id+1 id+2 ..."
        else: key = self.listtostr(wordids, rev=True) # build it in reverse order (..., id-2, id-1)
        counts.setdefault(key, 0)
        counts[key] += 1 # count the occurrences
  
    # print the results
    rs = 'Top %d results of %d' % (numrets,len(counts))
    if numrets > len(counts): rs = '%d results' % len(counts) 
#    print "%s for collocations of \"%s (%d results)\"" % (rs, query, len(rows))
    print "%s for collocations of \"%s\"" % (rs, query)
  
    # sort by occurrence count
    sortedcounts = sorted([(count, id) for (id, count) in counts.items()], reverse=1)
    for (count, id) in sortedcounts[0:numrets]:
      ids = id.split(' ')
      if numcols > 0: 
        print '... %s' % query,
        for id in ids: print self.getword(id),
        print " ... :", count
      else:
        print '...', 
        for id in ids: print self.getword(id),
        print "%s ... : %d" % (query, count)
   
  def listtostr(self, list, rev=False):
    if not rev:
      s = str(list[0])
      k = 1
      while k < len(list):
        s += ' %s' % list[k]
        k += 1
    else:
      s = str(list[len(list) - 1])
      k = len(list) - 2
      while k >= 0:
        s += ' %s' % list[k]
        k -= 1
    return s

  def normalizescores(self, scores, smallIsBetter=0):
    # scores: dict
    vsmall = 0.00001
    if smallIsBetter:
      minscore = max(min(scores.values()), vsmall) 
      #minscore = min(scores.values()) # original
      return dict([(u, float(minscore)/max(vsmall, l)) for (u, l) in scores.items()])
    else:
      maxscore = max(scores.values())
      if maxscore == 0: 
        maxscore = vsmall
      return dict([(u, float(c)/maxscore) for (u, c) in scores.items()])
    
  def frequencyscore(self, rows):
    # row -> (urlid, word0location, word1location, word2location, ...)
    counts = dict([(row[0], 0) for row in rows])
    for row in rows: counts[row[0]] += 1
    return self.normalizescores(counts)
    
  def locationscore(self, rows):
    # row -> (urlid, word0location, word1location, word2location, ...)
    locations = dict([(row[0], 1000000) for row in rows]) # initialise with a large value
    for row in rows:
      loc = sum(row[1:]) # sum of the word locations
      if loc < locations[row[0]]:
        locations[row[0]] = loc # keep the smallest value per url (the same url can appear in several rows)
    return self.normalizescores(locations, smallIsBetter=1)
  
  def distancescore(self, rows):
    # row -> (urlid, word0location, word1location, word2location, ...)
    if len(rows[0]) <= 2:
      return dict([(row[0], 1.0) for row in rows]) # single-word query: everyone wins
    
    mindistance = dict([(row[0], 1000000) for row in rows]) # initialise with a large value
    
    for row in rows:
      # the word_i locations are not guaranteed to increase monotonically, so this may break down as the number of query words grows?
      dist = sum([abs(row[i] - row[i-1]) for i in range(2, len(row))])
      if dist < mindistance[row[0]]:
        mindistance[row[0]] = dist
    
    return self.normalizescores(mindistance, smallIsBetter=1)
  
  def inboundlinkscore(self, rows):
    uniqueurls = set([row[0] for row in rows])
    inboundcount = dict([(u, self.con.execute('select count(*) from link where toid=%d' % u).fetchone()[0])
      for u in uniqueurls])
    return self.normalizescores(inboundcount)
  
  def geturlname(self, id):
    return self.con.execute("select url from urllist where rowid=%d" % id).fetchone()[0]
  
  def getwordid(self, urlid, location):
    id = self.con.execute("select wordid from wordlocation where urlid = %s and location = %s" % (urlid, location)).fetchone()
    if id != None:
      return id[0]
    else:
      return -1
    
  def getword(self, wordid):
    return self.con.execute("select word from wordlist where rowid = %s" % (wordid)).fetchone()[0]
    
  def getwordlist(self):
    return self.con.execute("select rowid,word from wordlist").fetchall()
  
  def separatewords(self, text):
    return [s.lower() for s in self.splitter.split(text) if s != '']

class mecab:

  def __init__(self, libpath):
    self.lib = cdll.LoadLibrary(libpath)
    self.tagger = self.lib.mecab_new(c_int(2), (c_char_p * 2)('mecab', '-Owakati'))

  def __del__(self):
    self.lib.mecab_destroy(self.tagger)
  
  def split(self, str):
    str = self.lib.mecab_sparse_tostr(self.tagger, str.encode('utf-8'))
    r = c_char_p(str).value.split(' ')
    return r[:len(r)-1] # drop the trailing newline token

  def escape(self,str):
    if str.find("'") != -1:
      return str.replace("'", "''")
    return str

Crawling example.

import searchengine
c = searchengine.crawler('news.db')
pages = ['http://www.foxnews.com/', 'http://www.cnn.com/', 'http://www.cbsnews.com/']
c.createindextables()
c.crawl(pages,3)
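
If you want to redo the crawl from scratch on an existing DB, droptables() can be called before recreating the tables (just a sketch, using the same 'news.db' as above):

import searchengine
c = searchengine.crawler('news.db')
c.droptables()          # throw away the old index
c.createindextables()   # recreate the tables and indexes
c.crawl(['http://www.cnn.com/'], 2)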

Search examples.

s=searchengine.searcher('news.db')
s.query('news') # ordinary search
s.getcollocations("know")
s.getcollocations("don't you", 1)
s.getcollocations("with you", -2)