2020from .uri import AtWikiURI
2121
2222class AtWikiAPI (object ):
23- _PAGER_PATTERN = re .compile (r'.+?(\d+).+?( \d+).+?(\d+).+? ' ) # "計 110 ページ / 1 から 100 を表示 "
23+ _TAG_WEIGHT_PATTERN = re .compile (r'\(( \d+)\)$ ' ) # "タグ名(1) "
2424
2525 def __init__ (self , uri , ** kwargs ):
2626 self ._uri = uri
2727 self ._user_agent = kwargs .get ('user_agent' , 'Mozilla/5.0 (AtWikiPython)' )
2828 self ._sleep = kwargs .get ('sleep' , 10 )
2929
3030 def get_list (self , tag = None ):
31- index = 0
31+ index = 1
3232 while True :
3333 count = 0
34- is_end = True
3534 if tag :
3635 soup = self ._request (self ._uri .tag (tag , index ))
37- links = soup .find ('div' , attrs = {'class' : 'cmd_tag' }).findAll ( 'a' , href = True )
38- is_end = False
36+ links = soup .find ('div' , attrs = {'class' : 'cmd_tag' }).find ( 'ul' ). select ( 'a' )
37+ pager = soup . find ( 'div' , attrs = { 'class' : 'cmd_tag' }). select_one ( 'a[href$="?&p={}"]' . format ( index + 1 ))
3938 else :
4039 soup = self ._request (self ._uri .list ('create' , index ))
4140 links = soup .find ('table' , attrs = {'class' : 'pagelist' }).findAll ('a' , href = True , title = True )
42- pager = soup .find ('div' , attrs = {'class' : 'pagelist' }).findAll ('p' )[2 ].text
43- m = self ._PAGER_PATTERN .search (pager )
44- if m :
45- (total , cursor_begin , cursor_end ) = (int (m .group (1 )), int (m .group (2 )), int (m .group (3 )))
46- is_end = (total == cursor_end )
47- else :
48- is_end = True
41+ pager = soup .find ('ul' , attrs = {'class' : 'atwiki_pagination' })
42+ if pager is not None :
43+ pager = pager .select_one ('a[href$="&pp={}"]' .format (index + 1 ))
44+ is_end = (pager is None or len (links ) == 0 )
4945 for link in links :
5046 page_id = self ._uri .get_page_id_from_uri (link .attrs ['href' ])
5147 page_name = link .text .strip ()
@@ -57,32 +53,25 @@ def get_list(self, tag=None):
5753 time .sleep (self ._sleep )
5854
5955 def get_tags (self ):
60- index = 0
56+ index = 1
6157 while True :
6258 count = 0
6359 soup = self ._request (self ._uri .tag ('' , index ))
6460 links = soup .find ('div' , attrs = {'class' : 'cmd_tag' }).findAll ('a' , attrs = {'class' : 'tag' })
6561 for link in links :
6662 tag_name = link .text
6763 tag_weight = 0
68- for clazz in link .attrs ['class' ]:
69- if clazz .startswith ('weight' ):
70- tag_weight = int (clazz [6 :])
71- break
64+ m = self ._TAG_WEIGHT_PATTERN .search (link .attrs ['title' ])
65+ if m :
66+ tag_weight = int (m .group (1 ))
7267 count += 1
7368 yield {'name' : tag_name , 'weight' : tag_weight }
7469 if count == 0 : break
7570
76- pagerArea = soup .find ('div' , attrs = {'class' : 'cmd_tag' }).find ('div' )
77- if pagerArea is None :
78- # Pager area will not be shown when tag list fits in one page.
79- assert index == 0
71+ # Find "次の500件" link.
72+ pager = soup .find ('div' , attrs = {'class' : 'cmd_tag' }).select_one ('a[href$="/tag/?p={}"]' .format (index + 1 ))
73+ if not pager :
8074 break
81- pagers = pagerArea .findAll ('a' )
82- if len (pagers ) == 1 :
83- if pagers [0 ].attrs ['href' ].endswith ('/?p={}' .format (index - 1 )):
84- # Valid pager found, and no more tags.
85- break
8675 index += 1
8776 time .sleep (self ._sleep )
8877
0 commit comments