Skip to content

Commit a800196

Browse files
committed
Add initial tests
1 parent 733d0e1 commit a800196

7 files changed

Lines changed: 194 additions & 0 deletions

File tree

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,9 @@
1+
# Binaries and build artifacts
2+
*.pyc
3+
*.o
4+
*.so
5+
build/
6+
dist/
7+
robotstextparser.egg-info/
8+
1 9
/.idea

Dockerfile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
FROM python:3.7.1
2+
3+
WORKDIR /usr/src/app
4+
#ENV PYTHONPATH /usr/src/app
5+
#COPY requirements.txt ./
6+
#RUN pip install --no-cache-dir -r requirements.txt
7+
8+
COPY . .

dev-requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
nose==1.3.7
2+
rednose==1.1.1

docker-compose.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
version: '3.7'
2+
services:
3+
python:
4+
build: .
5+
volumes:
6+
- .:/usr/src/app

setup.cfg

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[nosetests]
2+
verbosity=2
3+
rednose=1
4+
exe=1
5+
logging-clear-handlers=1

tests/__init__.py

Whitespace-only changes.

tests/test_robots.py

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
import codecs
2+
import unittest
3+
4+
from robotstxtparser import robotstxtparser
5+
6+
7+
class RobotsTest(unittest.TestCase):
    """Exercise RobotExclusionRulesParser against hand-written robots.txt bodies.

    Each test feeds a small robots.txt document to a fresh parser and then
    checks ``is_allowed(agent, url)`` for the agents and paths of interest.
    """

    def test_honors_specific_agent(self):
        """A group that names the agent explicitly takes precedence over '*'."""
        parser = robotstxtparser.RobotExclusionRulesParser()
        parser.parse('''
            User-agent: *
            Disallow: /tmp

            User-agent: agent
            Allow: /tmp
        ''')

        # The specific 'agent' group allows /tmp even though '*' disallows it.
        self.assertTrue(parser.is_allowed('agent', 'http://example.org/tmp'))
        self.assertTrue(parser.is_allowed('agent', 'http://example.org/path'))

    def test_grouping(self):
        """Consecutive User-agent lines form one group sharing the same rules."""
        parser = robotstxtparser.RobotExclusionRulesParser()
        parser.parse('''
            User-agent: one
            User-agent: two
            Disallow: /tmp
        ''')

        self.assertFalse(parser.is_allowed('one', 'http://example.org/tmp'))
        self.assertFalse(parser.is_allowed('two', 'http://example.org/tmp'))
        # An agent outside the group is unaffected by its Disallow.
        self.assertTrue(parser.is_allowed('agent', 'http://example.org/tmp'))

    def test_grouping_unknown_keys(self):
        """An unknown directive must break User-agent grouping.

        Real-world example: despite ``Noindex`` not being a valid directive,
        the '*' and 'ia_archiver' rules must not be considered together.
        """
        parser = robotstxtparser.RobotExclusionRulesParser()
        parser.parse('''
            User-agent: *
            Disallow: /content/2/
            User-agent: *
            Noindex: /gb.html
            Noindex: /content/2/
            User-agent: ia_archiver
            Disallow: /
        ''')

        self.assertTrue(parser.is_allowed('agent', 'http://example.org/foo'))
        self.assertTrue(parser.is_allowed('ia_archiver', 'http://example.org/bar'))

    def test_case_insensitivity(self):
        """User-agent matching ignores case."""
        parser = robotstxtparser.RobotExclusionRulesParser()
        parser.parse('''
            User-agent: Agent
            Disallow: /path
        ''')

        # Both lowercase and mixed-case spellings must hit the 'Agent' group.
        for spelling in ('agent', 'aGeNt'):
            self.assertFalse(parser.is_allowed(spelling, 'http://example.org/path'))

    def test_skip_malformed_line(self):
        """A line without a colon is skipped rather than treated as a rule."""
        parser = robotstxtparser.RobotExclusionRulesParser()
        parser.parse('''
            User-Agent: agent
            Disallow /no/colon/in/this/line
        ''')

        self.assertTrue(parser.is_allowed('agent', 'http://example.org/no/colon/in/this/line'))

    def test_utf8_bom(self):
        """A leading UTF-8 BOM is handled and the document parsed normally."""
        parser = robotstxtparser.RobotExclusionRulesParser()
        parser.parse(codecs.BOM_UTF8 + b'''
            User-Agent: agent
            Allow: /path
            User-Agent: other
            Disallow: /path
        ''')

        self.assertTrue(parser.is_allowed('agent', 'http://example.org/path'))
        self.assertFalse(parser.is_allowed('other', 'http://example.org/path'))

    def test_rfc_example(self):
        """Reproduce the worked example from the robots-exclusion RFC."""
        parser = robotstxtparser.RobotExclusionRulesParser()
        parser.parse('''
            # /robots.txt for http://www.fict.org/
            # comments to webmaster@fict.org

            User-agent: unhipbot
            Disallow: /

            User-agent: webcrawler
            User-agent: excite
            Disallow:

            User-agent: *
            Disallow: /org/plans.html
            Allow: /org/
            Allow: /serv
            Allow: /~mak
            Disallow: /
        ''')

        paths = (
            'http://example.org/',
            'http://example.org/index.html',
            'http://example.org/server.html',
            'http://example.org/services/fast.html',
            'http://example.org/services/slow.html',
            'http://example.org/orgo.gif',
            'http://example.org/org/about.html',
            'http://example.org/org/plans.html',
            'http://example.org/%7Ejim/jim.html',
            'http://example.org/%7Emak/mak.html',
        )

        # unhipbot is barred from everything by its 'Disallow: /'.
        for url in paths:
            self.assertFalse(parser.is_allowed('unhipbot', url))
        # TODO: /robots.txt itself should always be fetchable:
        # self.assertTrue(parser.is_allowed('unhipbot', 'http://example.org/robots.txt'))

        # webcrawler and excite share an empty Disallow, so everything is open.
        for agent in ('webcrawler', 'excite'):
            self.assertTrue(parser.is_allowed(agent, 'http://example.org/robots.txt'))
            for url in paths:
                self.assertTrue(parser.is_allowed(agent, url))

        # Every other agent falls under the '*' group.
        expectations = {
            'http://example.org/': False,
            'http://example.org/index.html': False,
            'http://example.org/server.html': True,
            'http://example.org/services/fast.html': True,
            'http://example.org/services/slow.html': True,
            'http://example.org/orgo.gif': False,
            'http://example.org/org/about.html': True,
            'http://example.org/org/plans.html': False,
            'http://example.org/%7Ejim/jim.html': False,
            'http://example.org/%7Emak/mak.html': True,
        }
        # TODO: /robots.txt itself should always be fetchable:
        # self.assertTrue(parser.is_allowed('anything', 'http://example.org/robots.txt'))
        for url, allowed in expectations.items():
            if allowed:
                self.assertTrue(parser.is_allowed('anything', url))
            else:
                self.assertFalse(parser.is_allowed('anything', url))
165+

0 commit comments

Comments
 (0)