1、获取网页的方法
# Fetch a web page via GET and via POST.
# NOTE: rewritten for Python 3 — the original Python 2 `urllib.urlencode` /
# `urllib.urlopen` no longer exist; they moved to urllib.parse / urllib.request.
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})

# GET: parameters are appended to the query string.
with urllib.request.urlopen(
        "http://www.musi-cal.com/cgi-bin/query?%s" % params) as f:
    print(f.read())

# POST: parameters go in the request body (must be bytes in Python 3).
with urllib.request.urlopen(
        "http://www.musi-cal.com/cgi-bin/query",
        params.encode('ascii')) as f:
    print(f.read())
2、Cookie
# Open URLs with cookie support: install a global opener whose
# CookieJar remembers cookies across requests.
# NOTE: rewritten for Python 3 — `cookielib` is now http.cookiejar and
# `urllib2` is urllib.request.
import http.cookiejar
import urllib.request

cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie_jar))
urllib.request.install_opener(opener)
# `url_string` and `param` are assumed to be defined by the surrounding
# code — TODO confirm; `param` must be bytes (POST body) or None (GET).
urllib.request.urlopen(url_string, param)
3、保存变量到文件,并从文件恢复
# Persist a variable to a file and restore it later.
from pickle import dump, load

# dump — pickle is a BINARY format, so the file must be opened in
# binary mode ('wb', not 'w'); `with` guarantees the handle is closed
# even if dump() raises.
with open('userlist.pck', 'wb') as fp:
    dump(user_list, fp)

# load — 'rb' for the same reason.
with open('userlist.pck', 'rb') as fp:
    user_list = load(fp)
4、正则表达式
`.*?` 为非贪婪匹配模式,而 `.*` 为贪婪匹配模式;`()` 用来在匹配时分组。
# Extract the hrefs from the "友情链接" (friendly-links) table row.
import re

# Non-greedy .*? stops at the FIRST </tr>; re.S lets `.` cross newlines,
# re.I ignores case. NOTE: .group(1) raises AttributeError if the
# pattern is not found in `page` (assumed defined by the caller).
content = re.search(r'友情链接.*?</tr>.*?<tr>(.*?)</tr>',
                    page, re.M | re.S | re.I).group(1)
# [^']* instead of the original greedy (.*) — a greedy .* would run to
# the LAST quote on a line, merging several hrefs into one match.
links = re.compile(r"href='([^']*)'").findall(content)
5、意外处理
# try / except / else skeleton. Reconstructed from a mangled one-liner:
# the original had no handler body, which is not valid Python.
try:
    ...                        # code that may raise
except KeyboardInterrupt:
    pass                       # handle Ctrl-C here
else:
    # `else` runs only when the try body raised nothing.
    raise Exception