# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals

try:
    # Python 3
    from urllib.request import urlopen, Request
except ImportError:
    # Python 2
    from urllib2 import urlopen, Request

import re
import time
from collections import namedtuple

import bs4
import bs4.element
from bs4 import BeautifulSoup

from .uri import AtWikiURI

class AtWikiAPI(object):
    """Scraping client for an @wiki (atwiki.jp) site, addressed through an AtWikiURI."""

    # Matches the list pager text, e.g. "計 110 ページ / 1 から 100 を表示"
    # ("110 pages in total / displaying 1 to 100"): total, first shown, last shown.
    _PAGER_PATTERN = re.compile(r'.+?(\d+).+?(\d+).+?(\d+).+?')

    def __init__(self, uri, **kwargs):
        self._uri = uri
        self._user_agent = kwargs.get('user_agent', 'Mozilla/5.0')
        self._sleep = kwargs.get('sleep', 1)  # seconds to wait between paged list requests

    def get_list(self, tag=None):
        """Yield {'id': ..., 'name': ...} for every page, paging through the full
        page list (or through the pages carrying ``tag`` if one is given)."""
        index = 0
        while True:
            count = 0
            is_end = True
            if tag:
                # Tag listings carry no pager summary, so rely on an empty page to stop.
                soup = self._request(self._uri.tag(tag, index))
                links = soup.find('div', attrs={'class': 'cmd_tag'}).findAll('a', href=True)
                is_end = False
            else:
                soup = self._request(self._uri.list('create', index))
                links = soup.find('table', attrs={'class': 'pagelist'}).findAll('a', href=True, title=True)
                pager = soup.find('div', attrs={'class': 'pagelist'}).findAll('p')[2].text
                m = self._PAGER_PATTERN.search(pager)
                if m:
                    (total, cursor_begin, cursor_end) = (int(m.group(1)), int(m.group(2)), int(m.group(3)))
                    is_end = (total == cursor_end)
            for link in links:
                page_id = self._uri.get_page_id_from_uri(link.attrs['href'])
                page_name = link.text.strip()
                if page_id:
                    count += 1
                    yield {'id': page_id, 'name': page_name}
            if count == 0 or is_end:
                break
            index += 1
            time.sleep(self._sleep)

    def get_source(self, page_id, generation=0):
        """Return the raw wiki source of ``page_id``, fetched from the backup view
        for the given ``generation``."""
        soup = self._request(self._uri.backup_source(page_id, generation))
        return soup.find('pre', attrs={'class': 'cmd_backup'}).text.replace('\r', '')

    def search(self, keyword, is_and=True):
        """Yield {'name': ..., 'snippet': ...} for pages matching ``keyword``."""
        soup = self._request(self._uri.search(keyword, is_and))
        # Drop the last item, which is a link to http://atwiki.jp/wiki/keyword rather than a hit.
        lis = soup.find('div', id='wikibody').findAll('li')[:-1]
        for li in lis:
            a = li.find('a')
            name = a.text
            snippet = None
            for sib in a.next_siblings:
                # The snippet starts after the first <br>; siblings before it are skipped.
                if snippet is None:
                    if sib.name == 'br':
                        snippet = ''
                    continue
                if isinstance(sib, bs4.element.Tag):
                    snippet += sib.text
                else:
                    snippet += str(sib)
            snippet = '' if snippet is None else snippet.strip()
            yield {'name': name, 'snippet': snippet}

    def _request(self, url, data=None):
        """Fetch ``url`` with the configured User-Agent and return a parsed BeautifulSoup tree."""
        req = Request(url, headers={'User-Agent': self._user_agent}, data=data)
        return BeautifulSoup(urlopen(req).read(), 'html.parser')
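
# Usage sketch (illustrative, not part of the original module): drive the class
# above to walk every page and dump its source. The AtWikiURI constructor
# argument shown below is an assumption -- its real signature lives in .uri and
# is not shown in this file.
#
#     api = AtWikiAPI(AtWikiURI('https://w.atwiki.jp/example/'), sleep=1)  # hypothetical argument
#     for page in api.get_list():
#         print(page['id'], page['name'])
#         print(api.get_source(page['id']))
#     for hit in api.search('keyword'):
#         print('%s: %s' % (hit['name'], hit['snippet']))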